diff --git a/.clang-format b/.clang-format index 9ba433b17362424973626470d930356c2173dd84..aff93435f58c522f5ed1090aef2005f76e91cf31 100644 --- a/.clang-format +++ b/.clang-format @@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true BinPackParameters: false BinPackArguments: false ... - diff --git a/.clang_format.hook b/.clang_format.hook new file mode 100755 index 0000000000000000000000000000000000000000..1d928216867c0ba3897d71542fea44debf8d72a0 --- /dev/null +++ b/.clang_format.hook @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +readonly VERSION="3.8" + +version=$(clang-format -version) + +if ! [[ $version == *"$VERSION"* ]]; then + echo "clang-format version check failed." + echo "a version contains '$VERSION' is needed, but get '$version'" + echo "you can install the right version, and make an soft-link to '\$PATH' env" + exit -1 +fi + +clang-format $@ diff --git a/.copyright.hook b/.copyright.hook new file mode 100644 index 0000000000000000000000000000000000000000..09afff2072df3384a429d01d06188218ae6e85d1 --- /dev/null +++ b/.copyright.hook @@ -0,0 +1,121 @@ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import io, re +import sys, os +import subprocess +import platform + +COPYRIGHT = ''' + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +LANG_COMMENT_MARK = None + +NEW_LINE_MARK = None + +COPYRIGHT_HEADER = None + +if platform.system() == "Windows": + NEW_LINE_MARK = "\r\n" +else: + NEW_LINE_MARK = '\n' + COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1] + p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0) + process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE) + date, err = process.communicate() + date = date.decode("utf-8").rstrip("\n") + COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date) + + +def generate_copyright(template, lang='C'): + if lang == 'Python': + LANG_COMMENT_MARK = '#' + else: + LANG_COMMENT_MARK = "//" + + lines = template.split(NEW_LINE_MARK) + BLANK = " " + ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK + for lino, line in enumerate(lines): + if lino == 0 or lino == 1 or lino == len(lines) - 1: continue + if len(line) == 0: + BLANK = "" + else: + BLANK = " " + ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK + + return ans + "\n" + + +def lang_type(filename): + if filename.endswith(".py"): + return "Python" + elif filename.endswith(".h"): + return "C" + elif filename.endswith(".c"): + return "C" + elif filename.endswith(".hpp"): + return "C" + elif filename.endswith(".cc"): + return "C" + elif filename.endswith(".cpp"): + return "C" + elif filename.endswith(".cu"): + return "C" + elif filename.endswith(".cuh"): + return "C" + elif filename.endswith(".go"): + return "C" + elif filename.endswith(".proto"): + return "C" + else: + print("Unsupported filetype %s", filename) + exit(0) + + +PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)") + + +def main(argv=None): + parser = argparse.ArgumentParser( + description='Checker for copyright declaration.') + parser.add_argument('filenames', nargs='*', help='Filenames to check') + args = parser.parse_args(argv) + + retv = 0 + for filename in args.filenames: + fd = io.open(filename, encoding="utf-8") + first_line = fd.readline() + second_line = fd.readline() + if "COPYRIGHT (C)" in first_line.upper(): continue + if first_line.startswith("#!") or PYTHON_ENCODE.match( + second_line) != None or PYTHON_ENCODE.match(first_line) != None: + continue + original_contents = io.open(filename, encoding="utf-8").read() + new_contents = generate_copyright( + COPYRIGHT, lang_type(filename)) + original_contents + print('Auto Insert Copyright Header {}'.format(filename)) + retv = 1 + with io.open(filename, 'w') as output_file: + output_file.write(new_contents) + + return retv + + +if __name__ == '__main__': + exit(main()) diff --git a/.gitignore b/.gitignore index 275173b9677bffe028152fe8eadb3384329aeb5a..ac56a3320ec85769d2c87c072512f5217eca0c24 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,13 @@ third_party/ # clion workspace. 
cmake-build-* + +# generated while compiling +python/paddle/v2/fluid/core.so +paddle/pybind/pybind.h +CMakeFiles +cmake_install.cmake +paddle/.timestamp +python/paddlepaddle.egg-info/ +paddle/pybind/pybind.h +python/paddle/version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4cd8eb12f6b23c67e8fb22f43d57afd4a96770fd..89c620bb2f7ef634fa80b64eec7037e8cb9a190c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,25 @@ - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer -- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git - sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 +- repo: local hooks: - - id: clang-formater + - id: clang-format-with-version-check + name: clang-format + description: Format files with ClangFormat. + entry: bash ./.clang_format.hook -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ +- repo: https://github.com/PaddlePaddle/pre-commit-golang + sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 + hooks: + - id: go-fmt + types: + - go +- repo: local + hooks: + - id: copyright_checker + name: copyright_checker + entry: python ./.copyright.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ + exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/.travis.yml b/.travis.yml index 44b755ee32d204c883f0d74e7ad0f78380918954..e2d49daa1981396628efa5d16459eb70e9e76884 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,29 +1,26 @@ language: cpp cache: directories: - - $HOME/third_party - $HOME/.ccache - $HOME/.cache/pip + - $TRAVIS_BUILD_DIR/build/third_party sudo: required dist: trusty os: - linux env: - - JOB=DOCS - - JOB=BUILD_AND_TEST - - JOB=PRE_COMMIT + - JOB=build_doc + - JOB=check_style addons: apt: packages: - gcc-4.8 - g++-4.8 - - gfortran-4.8 - git - build-essential - python - python-pip - python2.7-dev - - python-numpy - python-wheel - libboost-dev - curl @@ -33,29 +30,27 @@ addons: - automake - libtool - ccache + ssh_known_hosts: 52.76.173.135 before_install: - - | - if [ ${JOB} == "BUILD_AND_TEST" ]; then - local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE` - if [ $? -eq 0 ]; then # if git diff return no zero, then rerun unit test. - if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)' - then - echo "Only markdown docs were updated, stopping build process." - exit - fi - fi - fi - - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python + - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi + # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. 
- - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - - pip install rarfile + - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt + - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - - | - timeout 2580 paddle/scripts/travis/main.sh # 43min timeout - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; + - | + timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout + RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi; + - | + if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; + if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; + if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; + export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh + export DOCS_DIR=`pwd` + cd .. + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc notifications: email: on_success: change diff --git a/AUTHORS.md b/AUTHORS.md index d5baee2161aa1d5360056e03ca67d5b2fe9ff7d2..4db4a4a8e7441b07ce2db4adff043bb99a09014b 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,28 +1,48 @@ | Github account | name | |---|---| -| reyoung | Yang Yu | +| backyes | Yan-Fei Wang | +| beckett1124 | Bin Qi | +| Canpio | Jia-Yi Feng | +| chengxiaohua1105 | Xiao-Hua Cheng | +| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | +| cxysteven | Xing-Yi Cheng | +| dzhwinter | Zhi-Hong Dong | +| emailweixu | Wei Xu | | gangliao | Gang Liao | -| luotao01 | Tao Luo | -| jacquesqiao | Long-Fei Qiao | -| qingqing01 | Qing-Qing Dang | +| gongweibao | Wei-Bao Gong | +| Guo Sheng | Sheng Guo | +| Haichao-Zhang | Hai-Chao Zhang | | hedaoyuan | Dao-Yuan He | -| wangyang59 | Yang Wang | +| helinwang | He-Lin Wang | +| jacquesqiao | Long-Fei Qiao | +| kuke | Yi-Bing Liu | +| lcy-seso | Ying Cao | +| lipeng-unisound | Peng Li | +| liuyuan | Yuan Liu | +| livc | Zhao Li | +| llxxxll | Yong-Feng Liu | +| luotao01 | Tao Luo | +| lzhao4ever | Liang Zhao | +| NHZlX | Zhao-Long Xing | +| pakchoi | Chuan-Jiang Song | +| pengli09 | Peng Li | +| pkuyym | Ya-Ming Yang | | QiJune | Jun Qi | +| qingqing01 | Qing-Qing Dang | +| reyoung | Yang Yu | +| Superjom | Chun-Wei Yan | | tianbingsz | Tian-Bing Xu | -| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | typhoonzero | Yi Wu | -| backyes | Yan-Fei Wang | -| pengli09 | Peng Li | -| livc | Zhao Li | +| wanghaoshuang | Hao-Shuang Wang | +| wangyang59 | Yang Wang | +| wangzhen-nlp | Zhen Wang | +| wen-bo-yang | Wen-Bo Yang | +| wwhu | Wei-Wei Hu | +| xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | +| xujun05 | Jun Xu | +| xushaoyong | Shao-Yong Xu | | Yancey1989 | Xu Yan | -| emailweixu | Wei Xu | -| wen-bo-yang | Wen-Bo Yang | -| helinwang | He-Lin Wang | -| lcy-seso | Ying Cao | -| Zrachel | Rui-Qing Zhang | -| Haichao-Zhang | Hai-Chao Zhang | -| gongweibao | Wei-Bao Gong | -| lzhao4ever | Liang Zhao | +| zhaopu7 | Pu Zhao | | zhouxiao-coder | Xiao Zhou | -| lipeng-unisound | Peng Li | +| Zrachel | Rui-Qing Zhang | diff --git a/CMakeLists.txt b/CMakeLists.txt index 79210d043648de5d493f0b998eeb885c993a6106..7c7eb260aea8478f4833cb79253f4481e10b8685 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,13 +13,17 @@ # limitations 
under the License cmake_minimum_required(VERSION 3.0) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) +set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) project(paddle CXX C Go) +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) @@ -33,6 +37,7 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -47,6 +52,14 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) +# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. +option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON) +option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) +option(GLIDE_INSTALL "Download and install go dependencies " ON) +option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) +option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) +option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) +option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -55,22 +68,36 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -if(ANDROID) - if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21") +if(ANDROID OR IOS) + if(ANDROID) + if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") + message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") + endif() endif() set(WITH_GPU OFF CACHE STRING - "Disable GPU when cross-compiling for Android" FORCE) + "Disable GPU when cross-compiling for Android and iOS" FORCE) set(WITH_AVX OFF CACHE STRING - "Disable AVX when cross-compiling for Android" FORCE) + "Disable AVX when cross-compiling for Android and iOS" FORCE) set(WITH_PYTHON OFF CACHE STRING - "Disable PYTHON when cross-compiling for Android" FORCE) + "Disable PYTHON when cross-compiling for Android and iOS" FORCE) set(WITH_RDMA OFF CACHE STRING - "Disable RDMA when cross-compiling for Android" FORCE) -endif(ANDROID) + "Disable RDMA when cross-compiling for Android and iOS" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_GOLANG OFF CACHE STRING + "Disable golang when cross-compiling for Android and iOS" FORCE) + + # Compile PaddlePaddle mobile inference library + if (NOT WITH_C_API) + set(WITH_C_API ON CACHE STRING + "Always compile the C_API when cross-compiling for Android and iOS" FORCE) + endif() + set(MOBILE_INFERENCE ON) + add_definitions(-DPADDLE_MOBILE_INFERENCE) +endif() -set(THIRD_PARTY_PATH 
"${PROJ_ROOT}/third_party" CACHE STRING +set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") if (WITH_C_API AND WITH_PYTHON) @@ -79,8 +106,27 @@ if (WITH_C_API AND WITH_PYTHON) "different Python interpreter from compiling.") endif() +if (WITH_C_API) + set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE) +endif() + +if(MOBILE_INFERENCE) + set(THIRD_PARTY_BUILD_TYPE MinSizeRel) +else() + set(THIRD_PARTY_BUILD_TYPE Release) +endif() + +set(WITH_MKLML ${WITH_MKL}) +if (WITH_MKL AND AVX2_FOUND) + set(WITH_MKLDNN ON) +else() + message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") + set(WITH_MKLDNN OFF) +endif() + ######################################################################################## +include(external/mklml) # download mklml package include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -88,10 +134,19 @@ include(external/gtest) # download, build, install gtest include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas +include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc +include(external/boost) # download, build, install boost include(external/any) # download libn::any +include(external/eigen) # download eigen3 +include(external/pybind11) # download pybind11 +include(external/nccl) +include(external/cares) +include(external/grpc) +include(cudnn) # set cudnn libraries, must before configure +include(configure) # add paddle env configuration include(generic) # simplify cmake module include(package) # set paddle packages include(cpplint) # set paddle c++ style @@ -99,14 +154,14 @@ include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(rdma) # set rdma libraries include(flags) # set paddle compile flags -include(cudnn) # set cudnn libraries include(version) # set PADDLE_VERSION include(coveralls) # set code coverage -include(configure) # add paddle env configuration -include_directories("${PROJ_ROOT}") -include_directories("${PROJ_ROOT}/paddle/cuda/include") + +include_directories("${PADDLE_SOURCE_DIR}") +include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") set(EXTERNAL_LIBS ${GFLAGS_LIBRARIES} @@ -118,15 +173,45 @@ set(EXTERNAL_LIBS ) if(WITH_GPU) - list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) - if(NOT WITH_DSO) - list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) - endif(NOT WITH_DSO) + include(cuda) endif(WITH_GPU) +if(WITH_MKLML) + list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) +endif() + +if(WITH_MKLDNN) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) +endif() + +if(USE_NNPACK) + include(external/nnpack) + list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) +endif(USE_NNPACK) + add_subdirectory(proto) + +if(NOT MOBILE_INFERENCE) + # "add_subdirectory(go)" should be placed after the following loine, + # because it depends on paddle/optimizer. 
+ add_subdirectory(paddle/optimizer) +endif() + +# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be +# placed after this block, because they depends on it. +if(WITH_GOLANG) + add_subdirectory(go) +endif(WITH_GOLANG) + +set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") + +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") + add_subdirectory(paddle) -add_subdirectory(python) +if(WITH_PYTHON) + add_subdirectory(python) +endif() if(WITH_DOC) add_subdirectory(doc) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..54131b48eca463aef817a4b96ba1b64de4b60aab --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at paddle-dev@baidu.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. 
Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/CODE_OF_CONDUCT_cn.md b/CODE_OF_CONDUCT_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..2be794f1f324cf9b6bc304d4e5812076b56f4551 --- /dev/null +++ b/CODE_OF_CONDUCT_cn.md @@ -0,0 +1,50 @@ +# 参与者公约 + +## 我们的保证 + +为了促进一个开放透明且友好的环境,我们作为贡献者和维护者保证:无论年龄、种族、民族、性别认同和表达(方式)、体型、身体健全与否、经验水平、国籍、个人表现、宗教或性别取向,参与者在我们项目和社区中都免于骚扰。 + +## 我们的标准 + +有助于创造正面环境的行为包括但不限于: +* 使用友好和包容性语言 +* 尊重不同的观点和经历 +* 耐心地接受建设性批评 +* 关注对社区最有利的事情 +* 友善对待其他社区成员 + +身为参与者不能接受的行为包括但不限于: +* 使用与性有关的言语或是图像,以及不受欢迎的性骚扰 +* 捣乱/煽动/造谣的行为或进行侮辱/贬损的评论,人身攻击及政治攻击 +* 公开或私下的骚扰 +* 未经许可地发布他人的个人资料,例如住址或是电子地址 +* 其他可以被合理地认定为不恰当或者违反职业操守的行为 + +## 我们的责任 + +项目维护者有责任为「可接受的行为」标准做出诠释,以及对已发生的不被接受的行为采取恰当且公平的纠正措施。 + +项目维护者有权利及责任去删除、编辑、拒绝与本行为标准有所违背的评论(comments)、提交(commits)、代码、wiki 编辑、问题(issues)和其他贡献,以及项目维护者可暂时或永久性的禁止任何他们认为有不适当、威胁、冒犯、有害行为的贡献者。 + +## 使用范围 + +当一个人代表该项目或是其社区时,本行为标准适用于其项目平台和公共平台。 + +代表项目或是社区的情况,举例来说包括使用官方项目的电子邮件地址、通过官方的社区媒体账号发布或线上或线下事件中担任指定代表。 + +该项目的呈现方式可由其项目维护者进行进一步的定义及解释。 + +## 强制执行 + +可以通过paddle-dev@baidu.com,来联系项目团队来举报滥用、骚扰或其他不被接受的行为。 + +任何维护团队认为有必要且适合的所有投诉都将进行审查及调查,并做出相对应的回应。项目小组有对事件回报者有保密的义务。具体执行的方针近一步细节可能会单独公布。 + +没有切实地遵守或是执行本行为标准的项目维护人员,可能会因项目领导人或是其他成员的决定,暂时或是永久地取消其参与资格。 + +## 来源 + +本行为标准改编自[贡献者公约][主页],版本 1.4 +可在此观看https://www.contributor-covenant.org/zh-cn/version/1/4/code-of-conduct.html + +[主页]: https://www.contributor-covenant.org diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0d4bb973ae87bb45ef4386a63c26ed62602f2cee..a60453ff4e3bba6e6cb3b3de915dd69afd3a1ec3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1 +1,157 @@ -./doc/howto/dev/contribute_to_paddle_en.md +# Contribute Code + +We sincerely appreciate your contribution. This document explains our workflow and work style. + +## Workflow + +PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). The following steps guide usual contributions. + +1. Fork + + Our development community has been growing fastly; it doesn't make sense for everyone to write into the official repo. So, please file Pull Requests from your fork. To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/). + +1. Clone + + To make a copy of your fork to your local computers, please run + + ```bash + git clone https://github.com/your-github-account/paddle + cd paddle + ``` + +1. Create the local feature branch + + For daily works like adding a new feature or fixing a bug, please open your feature branch before coding: + + ```bash + git checkout -b my-cool-stuff + ``` + +1. Commit + + Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands: + + ```bash + pip install pre-commit + pre-commit install + ``` + + Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python. + + Once installed, `pre-commit` checks the style of code and documentation in every commit. 
We will see something like the following when you run `git commit`: + + ``` + ➜ git commit + CRLF end-lines remover...............................(no files to check)Skipped + yapf.................................................(no files to check)Skipped + Check for added large files..............................................Passed + Check for merge conflicts................................................Passed + Check for broken symlinks................................................Passed + Detect Private Key...................................(no files to check)Skipped + Fix End of Files.....................................(no files to check)Skipped + clang-formater.......................................(no files to check)Skipped + [my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 + ``` + +1. Build and test + + Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md). + +1. Keep pulling + + An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others work early, and it's easier to resolve smaller conflicts. + + ```bash + git remote add upstream https://github.com/PaddlePaddle/Paddle + git pull upstream develop + ``` + +1. Push and file a pull request + + You can "push" your local work into your forked repo: + + ```bash + git push origin my-cool-stuff + ``` + + The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one. + + To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/). + + If your change is for fixing an issue, please write ["Fixes "](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. Github would close the issue when the owners merge your pull request. + + Please remember to specify some reviewers for your pull request. If you don't know who are the right ones, please follow Github's recommendation. + + +1. Delete local and remote branches + + To keep your local workspace and your fork clean, you might want to remove merged branches: + + ```bash + git push origin :my-cool-stuff + git checkout develop + git pull upstream develop + git branch -d my-cool-stuff + ``` + +### Code Review + +- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. Please do this after your pull request passes the CI. + +- Please answer reviewers' every comment. If you are to follow the comment, please write "Done"; please give a reason otherwise. + +- If you don't want your reviewers to get overwhelmed by email notifications, you might reply their comments by [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/). + +- Reduce the unnecessary commits. Some developers commit often. It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`. + + +## Coding Standard + +### Code Style + +Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html). + +Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). 
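For routine edits, the style tools referenced here can also be run by hand before committing. A minimal sketch, assuming clang-format 3.8 and yapf are installed; the file paths are hypothetical:

```bash
# format a changed C++ file in place via the version-checked wrapper script in this repository
bash ./.clang_format.hook -i paddle/math/MyNewClass.cpp

# format a changed Python file in place according to the PEP8-based configuration
yapf -i python/paddle/v2/my_new_module.py
```

The pre-commit hooks described below run these same formatters automatically on every commit.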
+ +Our build process helps to check the code style. In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default. This flag is on + +Please install pre-commit, which automatically reformat the changes to C/C++ and Python code whenever we run `git commit`. To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43). + +### Unit Tests + +Please remember to add related unit tests. + +- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). + +- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/). + + +### Writing Logs + +We use [glog](https://github.com/google/glog) for logging in our C/C++ code. + +For general information, please use `LOG`. For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose). The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ). + +`VLOG` requires a *verbose level* parameter. For example: + +```c++ +VLOG(3) << "Operator FC is taking " << num_inputs << "inputs." +``` + +When we run a PaddlePaddle application or test, we can specify a verbose threshold. For example: + +```bash +GLOG_vmodule=buddy_allocator=2 \ +GLOG_v=10 \ +python \ +../python/paddle/v2/framework/tests/test_recurrent_op.py +``` + +This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. 
When coding C++, please follow the verbose level convention as follows: + +- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) +- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) +- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) diff --git a/Dockerfile b/Dockerfile index 39af60966b6cab7d8b9e644f4ea658613f8ba518..6ac9901ac6cea12e97047efdfb6272c957f166ae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX ARG WITH_DOC -ARG WITH_STYLE_CHECK ENV WOBOQ OFF -ENV WITH_GPU=${WITH_GPU:-OFF} +ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} ENV WITH_DOC=${WITH_DOC:-OFF} -ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} ENV HOME /root # Add bash enhancements @@ -24,24 +22,27 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ - git python-pip python-dev openssh-server bison \ - wget unzip tar xz-utils bzip2 gzip coreutils \ + git python-pip python-dev openssh-server bison libnccl-dev \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ - python-numpy python-matplotlib gcc g++ \ - automake locales clang-format-3.8 swig doxygen cmake \ - liblapack-dev liblapacke-dev libboost-dev \ + python-matplotlib gcc-4.8 g++-4.8 \ + automake locales clang-format swig doxygen cmake \ + liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - net-tools && \ + net-tools libtool && \ apt-get clean -y -# Install Go -RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ - tar -C /usr/local -xzf go.tgz && \ +# Install Go and glide +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ mkdir /root/gopath && \ - rm go.tgz + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src ENV GOROOT=/usr/local/go GOPATH=/root/gopath # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -ENV PATH=${PATH}:${GOROOT}/bin +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh # git credential to skip password typing RUN git config --global credential.helper store @@ -52,19 +53,23 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # version util jupyter fixes this issue. 
RUN pip install --upgrade pip && \ - pip install -U 'protobuf==3.1.0' && \ - pip install -U wheel pillow BeautifulSoup && \ + pip install -U wheel && \ pip install -U docopt PyYAML sphinx && \ - pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \ - pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install rarfile + pip install -U sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip install pre-commit 'ipython==5.3.0' && \ + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python + +COPY ./python/requirements.txt /root/ +RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev RUN pip install certifi urllib3[secure] + # Install woboq_codebrowser to /woboq RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ (cd /woboq \ diff --git a/Dockerfile.android b/Dockerfile.android index fa24f6f06c4e76444c83bcf13fe312afdcb6c348..9d13a414f67be04e17b7d83403228d92bce0eda9 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -4,9 +4,16 @@ MAINTAINER PaddlePaddle Authors ARG UBUNTU_MIRROR RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' +# ENV variables +ARG ANDROID_ABI +ARG ANDROID_API + +ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} +ENV ANDROID_API=${ANDROID_API:-21} + ENV HOME=/root \ ANDROID_NDK_HOME=/opt/android-ndk-linux \ - ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc + ANDROID_TOOLCHAINS_DIR=/opt/toolchains RUN apt-get update && \ apt-get install -y \ @@ -14,6 +21,16 @@ RUN apt-get update && \ wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ apt-get clean -y +# Install Go and glide +RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin + # git credential to skip password typing RUN git config --global credential.helper store @@ -26,13 +43,12 @@ RUN pip install --upgrade pip && \ pip install pre-commit # Android NDK -RUN mkdir /opt/android-ndk-tmp && \ +RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \ + mkdir -p /opt/android-ndk-tmp && \ cd /opt/android-ndk-tmp && \ wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \ - rm -rf /opt/android-ndk-tmp && \ - rm -rf ${ANDROID_NDK_HOME} + rm -rf /opt/android-ndk-tmp CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"] diff --git a/README.md b/README.md index fa16cc3cf2ef9c1200a19e03192c94c65fc08679..d06375a444dd65675bdd75baccf8445c1638a87c 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -36,7 +36,8 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl examples: - Optimized math operations through SSE/AVX intrinsics, BLAS libraries - (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels. + (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels. + - Optimized CNN networks through MKL-DNN library. - Highly optimized recurrent networks which can handle **variable-length** sequence without padding. - Optimized local and distributed training for models with high dimensional @@ -51,45 +52,46 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl - **Connected to Products** In addition, PaddlePaddle is also designed to be easily deployable. At Baidu, - PaddlePaddle has been deployed into products or service with a vast number + PaddlePaddle has been deployed into products and services with a vast number of users, including ad click-through rate (CTR) prediction, large-scale image classification, optical character recognition(OCR), search ranking, computer virus detection, recommendation, etc. It is widely utilized in products at - Baidu and it has achieved a significant impact. We hope you can also exploit - the capability of PaddlePaddle to make a huge impact for your product. + Baidu and it has achieved a significant impact. 
We hope you can also explore + the capability of PaddlePaddle to make an impact on your product. ## Installation It is recommended to check out the -[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) +[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html) before looking into the -[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html) +[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html). ## Documentation -We provide [English](http://www.paddlepaddle.org/develop/doc/) and -[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation. +We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and +[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation. -- [Deep Learning 101](http://book.paddlepaddle.org/index.html) +- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html) - You might want to start from the this online interactive book that can run in Jupyter Notebook. + You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) +- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html) You can run distributed training jobs on MPI clusters. -- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) +- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html) You can also run distributed training jobs on Kubernetes clusters. -- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html) +- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html) Our new API enables much shorter programs. -- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) +- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html) We appreciate your contributions! + ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). diff --git a/RELEASE.cn.md b/RELEASE.cn.md index 5deaf230a8f5dd3089993f0fc79b9460fd049750..494c59730dd3c2830514e8924aa3d59a34ac412e 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,3 +1,62 @@ +# v0.11.0版本 + +## PaddlePaddle Fluid + +- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. 
Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的 +源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。 + + +## 新特点 + +* 发布 `PaddlePaddle Fluid`。 +* 增加了用于模型预测的C-API。 +* 用Fluid API实现了一个简单的GAN的例子。 +* 增加了关于性能调优的文档。 +* 为`paddle.v2.dataset`下载数据集提供了重试机制. +* C++中使用protobuf-lite替换protobuf减少了二进制的大小。 +* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment). +* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。 +* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。 + - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet + - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。 +* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign) +* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod) +* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance) +* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq) +* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score) +* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice) +* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* 增加移动端友好的网页 + +## 改进 + +* 使用一个Python`whl`包即可安装. +* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。 +* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。 +* 删除了有一些bug的BarrierStat。 +* 清理和删除了paddle::Parameter中未使用的函数。 +* 删除了ProtoDataProvider。 +* Huber loss同时支持回归和分类。 +* 为sequence pooling 层增加`stride`参数。 +* v2 API自动使用cudnn batch normalization。 +* 可以使用一个固定的参数名共享BN层的参数。 +* 2D convolution operation支持variable-dimension input特性。 +* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。 +* 优化网页导航。 + +## 错误修复 + +* 修复ROI pooling的Bug. cc9a761 +* 修复当label是dense vector是AUC变成0的问题. #5274 +* 修复WarpCTC 层的Bug. + + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 diff --git a/RELEASE.md b/RELEASE.md index 146f7afa7dfbc152500b82fde28445ae3155c16c..5a62c955131007c9f3329d162c20d1b462550019 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,75 @@ +# Release v0.11.0 + +## PaddlePaddle Fluid + +- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. 
Fluid is + designed to allow users to program like PyTorch and TensorFlow Eager Execution. + In these systems, there is no longer the concept *model* and applications + do not include a symbolic description of a graph of operators nor a sequence + of layers. Instead, applications look exactly like a usual program that + describes a process of training or inference. The difference between + Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's + control-flow, `if-then-else` nor `for`. Instead, Fluid provides its + C++ implementations and their Python binding using the `with` statement. For an example + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program. +Executor works like an interpreter. In future version, we will improve +`Executor` into a debugger like GDB, and we might provide some compilers, +which, for example, takes an application like the above one, and outputs +an equivalent C++ source program, which can be compiled using +[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) +to generate binaries that use CUDA, or using +[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries +that make full use of Intel CPUs. + +## New Features + +* Release `PaddlePaddle Fluid`. +* Add C-API for model inference +* Use fluid API to create a simple GAN demo. +* Add develop guide about performance tunning. +* Add retry when download `paddle.v2.dataset`. +* Linking protobuf-lite not protobuf in C++. Reduce the binary size. +* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released. +* A new style cmake functions for Paddle. It is based on Bazel API. +* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN. + - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet + - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML. +* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign). +* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod). +* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance). +* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq). +* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score). +* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice). +* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* Add mobile friendly webpages. 
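As a pointer for trying the MKL support listed above, a build might be configured roughly as follows (the out-of-source directory layout is an assumption, not something the release notes prescribe):

```bash
mkdir -p build && cd build
cmake .. -DWITH_MKL=ON   # downloads MKLML as the CBLAS backend; also enables MKL-DNN on AVX2-capable CPUs
make -j"$(nproc)"
```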
+ +## Improvements + +* Build and install using a single `whl` package. +* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标). +* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices. +* Remove buggy BarrierStat. +* Clean and remove unused functions in paddle::Parameter. +* Remove ProtoDataProvider. +* Huber loss supports both regression and classification. +* Add the `stride` parameter for sequence pooling layers. +* Enable v2 API use cudnn batch normalization automatically. +* The BN layer's parameter can be shared by a fixed the parameter name. +* Support variable-dimension input feature for 2D convolution operation. +* Refine cmake about CUDA to automatically detect GPU architecture. +* Improved website navigation. + +## Bug Fixes + +* Fix bug in ROI pooling. cc9a761 +* Fix AUC is zero when label is dense vector. #5274 +* Fix bug in WarpCTC layer. + # Release v0.10.0 We are glad to release version 0.10.0. In this version, we are happy to release the new diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md new file mode 100644 index 0000000000000000000000000000000000000000..8b7dc5b7db800896eb4de2054ab5e584aed93999 --- /dev/null +++ b/benchmark/IntelOptimizedPaddle.md @@ -0,0 +1,112 @@ +# Benchmark + +Machine: + +- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket +- Laptop: TBD + +System: CentOS release 6.3 (Final), Docker 1.12.1. + +PaddlePaddle: +- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN) + - MKL-DNN tag v0.11 + - MKLML 2018.0.1.20171007 +- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS) + - OpenBLAS v0.2.20 + +On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively. + +## Benchmark Model + +### Server + +#### Training +Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet. 
+ +Input image size - 3 * 224 * 224, Time: images/second + +- VGG-19 + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| -----| --------| +| OpenBLAS | 7.80 | 9.00 | 10.80 | +| MKLML | 12.12 | 13.70 | 16.18 | +| MKL-DNN | 28.46 | 29.83 | 30.44 | + + + + - ResNet-50 + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 25.22 | 25.68 | 27.12 | +| MKLML | 32.52 | 31.89 | 33.12 | +| MKL-DNN | 81.69 | 82.35 | 84.08 | + + + + - GoogLeNet + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 89.52 | 96.97 | 108.25 | +| MKLML | 128.46| 137.89| 158.63 | +| MKL-DNN     | 250.46| 264.83| 269.50 | + + + +- AlexNet + +| BatchSize | 64 | 128 | 256 | +|--------------|--------| ------ | -------| +| OpenBLAS | 45.62 | 72.79 | 107.22 | +| MKLML | 66.37 | 105.60 | 144.04 | +| MKL-DNN | 399.00 | 498.94 | 626.53 | + + + +#### Inference +Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +- VGG-19 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|-------|-------|-------|-------| +| OpenBLAS | 1.10 | 1.96 | 3.62 | 3.63 | 2.25 | +| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 | +| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 | + + + +- ResNet-50 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|--------|--------|--------|--------| +| OpenBLAS | 3.31 | 6.72 | 11.59 | 13.17 | 9.27 | +| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 | +| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 | + + + +- GoogLeNet + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|--------|--------|--------|--------|--------| +| OpenBLAS | 12.06 | 23.56 | 34.48 | 36.45 | 23.12 | +| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 | +| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 | + + + +- AlexNet + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|--------|--------|--------|--------|--------| +| OpenBLAS | 3.53 | 6.23 | 15.04 | 26.06 | 31.62 | +| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 | +| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 | + + + +### Laptop +TBD diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b619613ea7a5b6e940ec735314e8e47338b2c600 --- /dev/null +++ b/benchmark/cluster/README.md @@ -0,0 +1,78 @@ +# Cluster Training Benchmark + +## Setup + +- Platform + - Kubernetes: v1.6.2 + - Linux Kernel: v3.10.0 + +- Resource + - CPU: 10 Cores per Pod + - Memory: 5GB per Pod + +- Docker Image + + We use different base Docker Image to run the benchmark on Kubernetes: + - PaddlePaddle v2: paddlepaddle/paddle:0.11.0 + - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id] + - TensorFlow: tensorflow/tensorflow:1.5.0-rc0 + +- Model + vgg16 is used in this benchmark. + +## Cases + +- Variable + - Batch Size of training data. + - PServer count of the training job. + - The number of trainers. + +- Invariant + - The resource of trainer/pserver Pod. 
+ +### Measure the Performance for Different Batch Size + +- PServer Count: 40 +- Trainer Count: 100 +- Metrics: mini-batch / sec + +| Batch Size | 32 | 64 | 128 | 256 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | +| TensorFlow | - | - | - | - | + +### Measure the Performance for Different PServer Count + +- Trainer Count: 100 +- Batch Size: 64 +- Metrics: mini-batch / sec + +| PServer Count | 10 | 20 | 40 | 60 | +| -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | +| TensorFlow | - | - | - | - | + +### Measure Parallel Efficiency By Increasing Trainer Count + +- PServer Count: 20 +- Batch Size: 64 +- Metrics: + +$S = \div(T1, TN)$ + +which S is the ratio of T1 over TN, training time of 1 and N trainers. +The parallel efficiency is: + +$E = \div(S, N)$ + +| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 | +| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | +| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - | +| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - | +| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - | + +## Reproduce the benchmark + +TODO diff --git a/benchmark/figs/alexnet-cpu-infer.png b/benchmark/figs/alexnet-cpu-infer.png new file mode 100644 index 0000000000000000000000000000000000000000..6215ae4e4288f969a909c258ddd5b5f51e6abb3f Binary files /dev/null and b/benchmark/figs/alexnet-cpu-infer.png differ diff --git a/benchmark/figs/alexnet-cpu-train.png b/benchmark/figs/alexnet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..b3200bbc049a9d75857fb5692902d7b475aa8f68 Binary files /dev/null and b/benchmark/figs/alexnet-cpu-train.png differ diff --git a/benchmark/figs/googlenet-cpu-infer.png b/benchmark/figs/googlenet-cpu-infer.png new file mode 100644 index 0000000000000000000000000000000000000000..19478d433bae651f4506153ded11a96d5137b409 Binary files /dev/null and b/benchmark/figs/googlenet-cpu-infer.png differ diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..4e86e058d0654d02c898bf7f5fe73aa1c7614e20 Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ diff --git a/benchmark/figs/resnet-cpu-infer.png b/benchmark/figs/resnet-cpu-infer.png new file mode 100644 index 0000000000000000000000000000000000000000..bc43d4b8d20c600d6f1046a5986a6c62adfa6b44 Binary files /dev/null and b/benchmark/figs/resnet-cpu-infer.png differ diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..96746b1759fa17d25ac5f40ed3678e16086364ba Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ diff --git a/benchmark/figs/vgg-cpu-infer.png b/benchmark/figs/vgg-cpu-infer.png new file mode 100644 index 0000000000000000000000000000000000000000..3a51ec6c474f0e0f0c4384c8ccd1e08c4382230b Binary files /dev/null and b/benchmark/figs/vgg-cpu-infer.png differ diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..6d548cfd59f86f8166c011d71ebde4e4b33ef644 Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py index 
3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0..70296081877588885a7a6d4d8491409b41a8d378 100644 --- a/benchmark/paddle/image/alexnet.py +++ b/benchmark/paddle/image/alexnet.py @@ -1,4 +1,16 @@ -#!/usr/bin/env python +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from paddle.trainer_config_helpers import * @@ -6,10 +18,24 @@ height = 227 width = 227 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) +gp = get_config_arg('layer_num', int, 1) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -31,7 +57,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2) # conv2 net = img_conv_layer( - input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1) + input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp) net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75) net = img_pool_layer(input=net, pool_size=3, stride=2) @@ -40,11 +66,11 @@ net = img_conv_layer( input=net, filter_size=3, num_filters=384, stride=1, padding=1) # conv4 net = img_conv_layer( - input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1) + input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp) # conv5 net = img_conv_layer( - input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1) + input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp) net = img_pool_layer(input=net, pool_size=3, stride=2) net = fc_layer( @@ -59,6 +85,9 @@ net = fc_layer( layer_attr=ExtraAttr(drop_rate=0.5)) net = fc_layer(input=net, size=1000, act=SoftmaxActivation()) -lab = data_layer('label', num_class) -loss = cross_entropy(input=net, label=lab) -outputs(loss) +if is_infer: + outputs(net) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=net, label=lab) + outputs(loss) diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index bc893bab98c4d2e07c62fbd012d51a0939db4766..2a850ccb7f2c75b467554181fc5f4aa8f2b97a09 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -5,10 +5,24 @@ height = 224 width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) - -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +use_gpu = get_config_arg('use_gpu', bool, True) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) + +args = { + 'height': 
height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -16,6 +30,8 @@ settings( learning_method=MomentumOptimizer(0.9), regularization=L2Regularization(0.0005 * batch_size)) +conv_projection = conv_projection if use_gpu else img_conv_layer + def inception2(name, input, channels, \ filter1, filter3R, filter3, @@ -138,12 +154,11 @@ def inception(name, input, channels, \ cat = concat_layer( name=name, input=[cov1, cov3, cov5, covprj], - bias_attr=True, + bias_attr=True if use_gpu else False, act=ReluActivation()) return cat -lab = data_layer(name="label", size=1000) data = data_layer(name="input", size=3 * height * width) # stage 1 @@ -221,6 +236,10 @@ pool5 = img_pool_layer( dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4) out3 = fc_layer( name="output3", input=dropout, size=1000, act=SoftmaxActivation()) -loss3 = cross_entropy(name='loss3', input=out3, label=lab) -outputs(loss3) +if is_infer: + outputs(out3) +else: + lab = data_layer(name="label", size=num_class) + loss3 = cross_entropy(name='loss3', input=out3, label=lab) + outputs(loss3) diff --git a/benchmark/paddle/image/plotlog.py b/benchmark/paddle/image/plotlog.py new file mode 100644 index 0000000000000000000000000000000000000000..8679d4f272d1b7aaf8d5a397f07698a6b70e4fcd --- /dev/null +++ b/benchmark/paddle/image/plotlog.py @@ -0,0 +1,114 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
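# Illustrative usage note (editorial addition, not part of this patch):
# plotlog.py reads a training log and plots loss / accuracy curves. Assuming
# a log produced by the training scripts in this benchmark (which pass
# --log_period=10 to `paddle train`), it might be invoked roughly as:
#   python plotlog.py -f logs/train-vgg-19-mkldnn-64.log -s 1.0 -p 10
# The log filename shown is an assumption taken from run_mkl_train.sh.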
+ +import sys +import argparse +import matplotlib.pyplot as plt + + +def parse_args(): + parser = argparse.ArgumentParser('Parse Log') + parser.add_argument( + '--file_path', '-f', type=str, help='the path of the log file') + parser.add_argument( + '--sample_rate', + '-s', + type=float, + default=1.0, + help='the rate to take samples from log') + parser.add_argument( + '--log_period', '-p', type=int, default=1, help='the period of log') + + args = parser.parse_args() + return args + + +def parse_file(file_name): + loss = [] + error = [] + with open(file_name) as f: + for i, line in enumerate(f): + line = line.strip() + if not line.startswith('pass'): + continue + line_split = line.split(' ') + if len(line_split) != 5: + continue + + loss_str = line_split[2][:-1] + cur_loss = float(loss_str.split('=')[-1]) + loss.append(cur_loss) + + err_str = line_split[3][:-1] + cur_err = float(err_str.split('=')[-1]) + error.append(cur_err) + + accuracy = [1.0 - err for err in error] + + return loss, accuracy + + +def sample(metric, sample_rate): + interval = int(1.0 / sample_rate) + if interval > len(metric): + return metric[:1] + + num = len(metric) / interval + idx = [interval * i for i in range(num)] + metric_sample = [metric[id] for id in idx] + return metric_sample + + +def plot_metric(metric, + batch_id, + graph_title, + line_style='b-', + line_label='y', + line_num=1): + plt.figure() + plt.title(graph_title) + if line_num == 1: + plt.plot(batch_id, metric, line_style, label=line_label) + else: + for i in range(line_num): + plt.plot(batch_id, metric[i], line_style[i], label=line_label[i]) + plt.xlabel('batch') + plt.ylabel(graph_title) + plt.legend() + plt.savefig(graph_title + '.jpg') + plt.close() + + +def main(): + args = parse_args() + assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should in the range (0, 1]." + + loss, accuracy = parse_file(args.file_path) + batch = [args.log_period * i for i in range(len(loss))] + + batch_sample = sample(batch, args.sample_rate) + loss_sample = sample(loss, args.sample_rate) + accuracy_sample = sample(accuracy, args.sample_rate) + + plot_metric(loss_sample, batch_sample, 'loss', line_label='loss') + plot_metric( + accuracy_sample, + batch_sample, + 'accuracy', + line_style='g-', + line_label='accuracy') + + +if __name__ == '__main__': + main() diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 1ac47212b5a75667e8e9d4465b33f575516e2836..21e0d381aab24d21d91cdcc4aa630f6107405f9e 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import io, os import random import numpy as np @@ -13,14 +27,21 @@ def initHook(settings, height, width, color, num_class, **kwargs): settings.data_size = settings.height * settings.width * 3 else: settings.data_size = settings.height * settings.width - - settings.slots = [dense_vector(settings.data_size), integer_value(1)] + settings.is_infer = kwargs.get('is_infer', False) + settings.num_samples = kwargs.get('num_samples', 2560) + if settings.is_infer: + settings.slots = [dense_vector(settings.data_size)] + else: + settings.slots = [dense_vector(settings.data_size), integer_value(1)] @provider( init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): - for i in xrange(1024): + for i in xrange(settings.num_samples): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() - lab = random.randint(0, settings.num_class) - yield img.astype('float32'), int(lab) + if settings.is_infer: + yield img.astype('float32') + else: + lab = random.randint(0, settings.num_class - 1) + yield img.astype('float32'), int(lab) diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2846e4763f1cda4602f03af5ec649d57ee6cf0d8 --- /dev/null +++ b/benchmark/paddle/image/resnet.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +from paddle.trainer_config_helpers import * + +height = 224 +width = 224 +num_class = 1000 +batch_size = get_config_arg('batch_size', int, 64) +layer_num = get_config_arg("layer_num", int, 50) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} +define_py_data_sources2( + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) + +settings( + batch_size=batch_size, + learning_rate=0.01 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) + + +#######################Network Configuration ############# +def conv_bn_layer(name, + input, + filter_size, + num_filters, + stride, + padding, + channels=None, + active_type=ReluActivation()): + """ + A wrapper for conv layer with batch normalization layers. + Note: + conv layer has no activation. + """ + + tmp = img_conv_layer( + name=name + "_conv", + input=input, + filter_size=filter_size, + num_channels=channels, + num_filters=num_filters, + stride=stride, + padding=padding, + act=LinearActivation(), + bias_attr=False) + return batch_norm_layer( + name=name + "_bn", + input=tmp, + act=active_type, + use_global_stats=is_infer) + + +def bottleneck_block(name, input, num_filters1, num_filters2): + """ + A wrapper for bottlenect building block in ResNet. + Last conv_bn_layer has no activation. + Addto layer has activation of relu. 
+ """ + last_name = conv_bn_layer( + name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=1, + padding=0) + last_name = conv_bn_layer( + name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + last_name = conv_bn_layer( + name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer( + name=name + "_addto", input=[input, last_name], act=ReluActivation()) + + +def mid_projection(name, input, num_filters1, num_filters2, stride=2): + """ + A wrapper for middile projection in ResNet. + projection shortcuts are used for increasing dimensions, + and other shortcuts are identity + branch1: projection shortcuts are used for increasing + dimensions, has no activation. + branch2x: bottleneck building block, shortcuts are identity. + """ + # stride = 2 + branch1 = conv_bn_layer( + name=name + '_branch1', + input=input, + filter_size=1, + num_filters=num_filters2, + stride=stride, + padding=0, + active_type=LinearActivation()) + + last_name = conv_bn_layer( + name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=stride, + padding=0) + last_name = conv_bn_layer( + name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + + last_name = conv_bn_layer( + name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer( + name=name + "_addto", input=[branch1, last_name], act=ReluActivation()) + + +img = data_layer(name='image', size=height * width * 3) + + +def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): + """ + A wrapper for 50,101,152 layers of ResNet. 
+ res2_num: number of blocks stacked in conv2_x + res3_num: number of blocks stacked in conv3_x + res4_num: number of blocks stacked in conv4_x + res5_num: number of blocks stacked in conv5_x + """ + # For ImageNet + # conv1: 112x112 + tmp = conv_bn_layer( + "conv1", + input=img, + filter_size=7, + channels=3, + num_filters=64, + stride=2, + padding=3) + tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2) + + # conv2_x: 56x56 + tmp = mid_projection( + name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1) + for i in xrange(2, res2_num + 1, 1): + tmp = bottleneck_block( + name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256) + + # conv3_x: 28x28 + tmp = mid_projection( + name="res3_1", input=tmp, num_filters1=128, num_filters2=512) + for i in xrange(2, res3_num + 1, 1): + tmp = bottleneck_block( + name="res3_" + str(i), + input=tmp, + num_filters1=128, + num_filters2=512) + + # conv4_x: 14x14 + tmp = mid_projection( + name="res4_1", input=tmp, num_filters1=256, num_filters2=1024) + for i in xrange(2, res4_num + 1, 1): + tmp = bottleneck_block( + name="res4_" + str(i), + input=tmp, + num_filters1=256, + num_filters2=1024) + + # conv5_x: 7x7 + tmp = mid_projection( + name="res5_1", input=tmp, num_filters1=512, num_filters2=2048) + for i in xrange(2, res5_num + 1, 1): + tmp = bottleneck_block( + name="res5_" + str(i), + input=tmp, + num_filters1=512, + num_filters2=2048) + + tmp = img_pool_layer( + name='avgpool', + input=tmp, + pool_size=7, + stride=1, + pool_type=AvgPooling()) + + return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation()) + + +if layer_num == 50: + resnet = deep_res_net(3, 4, 6, 3) +elif layer_num == 101: + resnet = deep_res_net(3, 4, 23, 3) +elif layer_num == 152: + resnet = deep_res_net(3, 8, 36, 3) +else: + print("Wrong layer number.") + +if is_infer: + outputs(resnet) +else: + lbl = data_layer(name="label", size=num_class) + loss = cross_entropy(name='loss', input=resnet, label=lbl) + outputs(loss) diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7 --- /dev/null +++ b/benchmark/paddle/image/run_mkl_infer.sh @@ -0,0 +1,87 @@ +set -e + +function clock_to_seconds() { + hours=`echo $1 | awk -F ':' '{print $1}'` + mins=`echo $1 | awk -F ':' '{print $2}'` + secs=`echo $1 | awk -F ':' '{print $3}'` + echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'` +} + +function infer() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + if [ $thread -gt $bs ]; then + thread=$bs + fi + log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." + exit 0 + fi + + models_in="models/${topology}-${layer_num}/pass-00000/" + if [ ! 
-d $models_in ]; then + echo "Training model ${topology}_${layer_num}" + paddle train --job=train \ + --config="${topology}.py" \ + --use_mkldnn=True \ + --use_gpu=False \ + --trainer_count=1 \ + --num_passes=1 \ + --save_dir="models/${topology}-${layer_num}" \ + --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \ + > /dev/null 2>&1 + echo "Done" + fi + log_period=$((256 / bs)) + paddle train --job=test \ + --config="${topology}.py" \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=$log_period \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ + --init_model_path=$models_in \ + 2>&1 | tee ${log} + + # calculate the last 5 logs period time of 1280 samples, + # the time before are burning time. + start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + start_sec=`clock_to_seconds $start` + end_sec=`clock_to_seconds $end` + fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'` + echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -f "test.list" ]; then + echo " " > test.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi +if [ ! -d "models" ]; then + mkdir -p models +fi + +# inference benchmark +for use_mkldnn in True False; do + for batchsize in 1 2 4 8 16; do + infer vgg 19 $batchsize $use_mkldnn + infer resnet 50 $batchsize $use_mkldnn + infer googlenet v1 $batchsize $use_mkldnn + infer alexnet 2 $batchsize $use_mkldnn + done +done diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..03d2d378fb72e36f765d89af788f6ee96fe21d4e --- /dev/null +++ b/benchmark/paddle/image/run_mkl_train.sh @@ -0,0 +1,52 @@ +set -e + +function train() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + # each trainer_count use only 1 core to avoid conflict + log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." + exit 0 + fi + args="batch_size=${bs},layer_num=${layer_num}" + config="${topology}.py" + paddle train --job=time \ + --config=$config \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=10 \ + --test_period=100 \ + --config_args=$args \ + 2>&1 | tee ${log} + + avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'` + fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'` + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! 
-d "logs" ]; then + mkdir logs +fi + +# training benchmark +for use_mkldnn in True False; do + for batchsize in 64 128 256; do + train vgg 19 $batchsize $use_mkldnn + train resnet 50 $batchsize $use_mkldnn + train googlenet v1 $batchsize $use_mkldnn + train alexnet 2 $batchsize $use_mkldnn + done +done diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..a9a7b8a66717c4be0543c3fe2db293fe199e3dc4 --- /dev/null +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -0,0 +1,69 @@ +set -e + +function clock_to_seconds() { + hours=`echo $1 | awk -F ':' '{print $1}'` + mins=`echo $1 | awk -F ':' '{print $2}'` + secs=`echo $1 | awk -F ':' '{print $3}'` + echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'` +} + +function infer() { + export OPENBLAS_MAIN_FREE=1 + topology=$1 + layer_num=$2 + bs=$3 + trainers=`nproc` + if [ $trainers -gt $bs ]; then + trainers=$bs + fi + log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log" + threads=$((`nproc` / trainers)) + if [ $threads -eq 0 ]; then + threads=1 + fi + export OPENBLAS_NUM_THREADS=$threads + + models_in="models/${topology}-${layer_num}/pass-00000/" + if [ ! -d $models_in ]; then + echo "./run_mkl_infer.sh to save the model first" + exit 0 + fi + log_period=$((32 / bs)) + paddle train --job=test \ + --config="${topology}.py" \ + --use_mkldnn=False \ + --use_gpu=False \ + --trainer_count=$trainers \ + --log_period=$log_period \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \ + --init_model_path=$models_in \ + 2>&1 | tee ${log} + + # calculate the last 5 logs period time of 160(=32*5) samples, + # the time before are burning time. + start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + start_sec=`clock_to_seconds $start` + end_sec=`clock_to_seconds $end` + fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'` + echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -f "test.list" ]; then + echo " " > test.list +fi +if [ ! 
-d "logs" ]; then + mkdir logs +fi + +# inference benchmark +for batchsize in 1 2 4 8 16; do + infer vgg 19 $batchsize + infer resnet 50 $batchsize + infer googlenet v1 $batchsize + infer alexnet 2 $batchsize +done diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..935cff6f2c97d25d6de556cfee25e27dbe49b5b6 --- /dev/null +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -0,0 +1,41 @@ +set -e + +function train() { + export OPENBLAS_NUM_THREADS=1 + topology=$1 + layer_num=$2 + bs=$3 + thread=`nproc` + # each trainer_count use only 1 core to avoid conflict + log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log" + args="batch_size=${bs},layer_num=${layer_num}" + config="${topology}.py" + paddle train --job=time \ + --config=$config \ + --use_mkldnn=False \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=3 \ + --test_period=30 \ + --config_args=$args \ + 2>&1 | tee ${log} + + avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'` + fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'` + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi + +# training benchmark +for batchsize in 64 128 256; do + train vgg 19 $batchsize + train resnet 50 $batchsize + train googlenet v1 $batchsize + train alexnet 2 $batchsize +done diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..ca0a6798fb8c35b68cf84d263855955eb93ba0b0 --- /dev/null +++ b/benchmark/paddle/image/vgg.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +from paddle.trainer_config_helpers import * + +height = 224 +width = 224 +num_class = 1000 +batch_size = get_config_arg('batch_size', int, 64) +layer_num = get_config_arg('layer_num', int, 19) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} +define_py_data_sources2( + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) + +settings( + batch_size=batch_size, + learning_rate=0.001 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) + +img = data_layer(name='image', size=height * width * 3) + + +def vgg_network(vgg_num=3): + tmp = img_conv_group( + input=img, + num_channels=3, + conv_padding=1, + conv_num_filter=[64, 64], + conv_filter_size=3, + conv_act=ReluActivation(), + pool_size=2, + pool_stride=2, + pool_type=MaxPooling()) + + tmp = img_conv_group( + input=tmp, + conv_num_filter=[128, 128], + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + + channels = [] + for i in range(vgg_num): + channels.append(256) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + channels = [] + for i in range(vgg_num): + channels.append(512) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + 
pool_type=MaxPooling(), + pool_size=2) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + + tmp = fc_layer( + input=tmp, + size=4096, + act=ReluActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) + + tmp = fc_layer( + input=tmp, + size=4096, + act=ReluActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) + + return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation()) + + +if layer_num == 16: + vgg = vgg_network(3) +elif layer_num == 19: + vgg = vgg_network(4) +else: + print("Wrong layer number.") + +if is_infer: + outputs(vgg) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=vgg, label=lab) + outputs(loss) diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py index fc4ed4025f9ed2e0a32a1709ff8df4af53521196..c3b5faa19aaa325aaaf9cd9cc8f757d3f3b3bdcb 100755 --- a/benchmark/paddle/rnn/imdb.py +++ b/benchmark/paddle/rnn/imdb.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import print_function import six.moves.cPickle as pickle import gzip diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py index 928ca75daf84ccebb775364b0be0d8b3d5eebff9..f35cd5b079ff09ac35b171ff3032ecc5adabc947 100644 --- a/benchmark/paddle/rnn/provider.py +++ b/benchmark/paddle/rnn/provider.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import io, os import random import numpy as np diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py index f6a39ef778e21bee7374718a1b1ddf43392825a8..a37d7e7c62282891d67cee74cac1408eac1244f7 100644 --- a/benchmark/tensorflow/image/alexnet.py +++ b/benchmark/tensorflow/image/alexnet.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from six.moves import xrange # pylint: disable=redefined-builtin from datetime import datetime import math diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py index 7b5ee78f4dd5429abd85d75c092a6e3a2a39f922..2ebab8fb60d471cd6de6d332d81608f2a992b9be 100644 --- a/benchmark/tensorflow/image/alexnet_multi_gpu.py +++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from six.moves import xrange # pylint: disable=redefined-builtin from datetime import datetime import math diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py index decf855b54451efba5f6a7868fbcf631789f3572..1202cbb171efd74dec1fc6690a82e1f5126685f0 100644 --- a/benchmark/tensorflow/image/googlenet.py +++ b/benchmark/tensorflow/image/googlenet.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from six.moves import xrange from datetime import datetime import math diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py index 31466faa37c47c66e4fe4628e28c867875e89f2e..f06437eb6c82bf0dfbb9a76799e83182ec5ee888 100644 --- a/benchmark/tensorflow/image/googlenet_multi_gpu.py +++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from six.moves import xrange # pylint: disable=redefined-builtin from datetime import datetime import math diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py index 1a625134a6c58586b29190ede9c66253f484d2cf..558c68575f4ae3ceba7119ed081549ad63e208d8 100644 --- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py +++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from six.moves import xrange # pylint: disable=redefined-builtin from datetime import datetime import math diff --git a/benchmark/tensorflow/rnn/reader.py b/benchmark/tensorflow/rnn/reader.py index f538329a15ea9ad9293c97c94340989e2c421eb2..9660d3c22b3a16954b0a1d09d38cf033824f0a5f 100755 --- a/benchmark/tensorflow/rnn/reader.py +++ b/benchmark/tensorflow/rnn/reader.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os.path import io import numpy as np diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 913f711afff3b8f9f77b8da978a3b9e7165d0077..6320b17520a687f88993b6f464d9115838b0f96b 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -1,85 +1,27 @@ # Find the CBlas and lapack libraries # -# It will search MKL, atlas, OpenBlas, reference-cblas in order. +# It will search MKLML, atlas, OpenBlas, reference-cblas in order. # # If any cblas implementation found, the following variable will be set. -# CBLAS_PROVIDER # one of MKL, ATLAS, OPENBLAS, REFERENCE +# CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE # CBLAS_INC_DIR # the include directory for cblas. # CBLAS_LIBS # a list of libraries should be linked by paddle. # # Each library should be full path to object file. -# -# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT -# during cmake. If none of them set, it will try to find cblas implementation in -# system paths. -# set(CBLAS_FOUND OFF) -## Find MKL First. 
-set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") -set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") - -find_path(MKL_INC_DIR mkl.h PATHS - ${MKL_ROOT}/include) -find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS - ${MKL_ROOT}/include) -find_library(MKL_CORE_LIB NAMES mkl_core PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) -find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) -find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) - -if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER MKL) - set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR}) - set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB}) - - add_definitions(-DPADDLE_USE_MKL) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})") - return() -endif() - -## Then find atlas. -set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas") -set(ATLAS_INCLUDE_SEARCH_PATHS - ${ATLAS_ROOT}/include - /usr/include - /usr/include/atlas) -set(ATLAS_LIB_SEARCH_PATHS - ${ATLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/atlas - /usr/lib/atlas - /usr/lib/atlas-base # special for ubuntu 14.04. - ) -find_path(ATLAS_INC_DIR NAMES cblas.h - PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) -find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h - PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) -find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 - PATHS ${ATLAS_LIB_SEARCH_PATHS}) -find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3 - PATHS ${ATLAS_LIB_SEARCH_PATHS}) - -if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB) +## Find MKLML First. +if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER ATLAS) - set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR}) - set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB}) + set(CBLAS_PROVIDER MKLML) + set(CBLAS_INC_DIR ${MKLML_INC_DIR}) + set(CBLAS_LIBRARIES ${MKLML_LIB}) - add_definitions(-DPADDLE_USE_ATLAS) + add_definitions(-DPADDLE_WITH_MKLML) add_definitions(-DLAPACK_FOUND) - message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") + message(STATUS "Found cblas and lapack in MKLML " + "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") return() endif() @@ -150,3 +92,10 @@ if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") endif() + +if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER vecLib) + set(CBLAS_INC_DIR ${VECLIB_INC_DIR}) + add_definitions(-DPADDLE_USE_VECLIB) +endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5e507e78f74eee885922f502f35e3c15fafb622d..5c6bcfde76a1201f792d04766d698db8cd395a49 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,10 +24,23 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_ARM_FP16) + add_definitions(-DPADDLE_ARM_FP16) + add_definitions("-march=armv8.2-a+fp16+simd") +endif(WITH_ARM_FP16) + +if(WITH_TESTING) + add_definitions(-DPADDLE_WITH_TESTING) +endif(WITH_TESTING) + if(NOT WITH_TIMER) add_definitions(-DPADDLE_DISABLE_TIMER) endif(NOT WITH_TIMER) +if(USE_EIGEN_FOR_BLAS) + add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS) +endif(USE_EIGEN_FOR_BLAS) + if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) @@ -40,20 +53,25 @@ if(NOT CMAKE_CROSSCOMPILING) endif() endif() +if(NOT WITH_GOLANG) + add_definitions(-DPADDLE_WITHOUT_GOLANG) +endif(NOT WITH_GOLANG) + if(NOT WITH_GPU) - add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() + add_definitions(-DPADDLE_WITH_CUDA) + FIND_PACKAGE(CUDA REQUIRED) if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) - message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile") + message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") endif() if(NOT CUDNN_FOUND) - message(FATAL_ERROR "Paddle need cudnn to compile") + message(FATAL_ERROR "Paddle needs cudnn to compile") endif() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") @@ -63,5 +81,63 @@ else() include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) +if (WITH_MKLML AND MKLML_IOMP_LIB) + message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") + set(OPENMP_FLAGS "-fopenmp") + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") +endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") + +if(WITH_GOLANG) + # we need to symlink Paddle directory into GOPATH. If we + # don't do it and we have code that depends on Paddle, go + # get ./... will download a new Paddle repo from Github, + # without the changes in our current Paddle repo that we + # want to build. + set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go") + file(MAKE_DIRECTORY ${GOPATH}) + set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle") + file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}") + set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go") + + add_custom_target(go_path) + add_custom_command(TARGET go_path + # Symlink Paddle directory into GOPATH + COMMAND mkdir -p ${PADDLE_IN_GOPATH} + COMMAND rm -rf ${PADDLE_IN_GOPATH} + COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH} + # Automatically get all dependencies specified in the source code + # We can't run `go get -d ./...` for every target, because + # multiple `go get` can not run concurrently, but make need to be + # able to run with multiple jobs. 
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + + if (GLIDE_INSTALL) + if(EXISTS $ENV{GOPATH}/bin/glide) + set(GLIDE "$ENV{GOPATH}/bin/glide") + else() + message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide") + endif() + + # this command will only run when the file it depends is missing + # or has changed, or the output is missing. + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide + COMMAND env GOPATH=${GOPATH} ${GLIDE} install + COMMAND touch ${CMAKE_BINARY_DIR}/glide + DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" + ) + + # depends on the custom command which outputs + # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to + # run every time this target is built. + add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path) + endif() + +endif(WITH_GOLANG) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 48f705818b70c92adef107fd3c973ae1ab3d34bb..4823dc3e91390002aefac70f7931b4197db05789 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -25,8 +25,10 @@ set(STYLE_FILTER "${STYLE_FILTER}-readability/casting") set(IGNORE_PATTERN .*ImportanceSampler.* .*cblas\\.h.* - .*LtrDataProvider.* - .*MultiDataProvider.*) + .*\\.pb\\.txt + .*MultiDataProvider.* + .*pb.* + .*pybind.h) # add_style_check_target # @@ -40,27 +42,21 @@ macro(add_style_check_target TARGET_NAME) if(WITH_STYLE_CHECK) set(SOURCES_LIST ${ARGN}) list(REMOVE_DUPLICATES SOURCES_LIST) - list(SORT SOURCES_LIST) - foreach(filename ${SOURCES_LIST}) - set(LINT ON) foreach(pattern ${IGNORE_PATTERN}) if(filename MATCHES ${pattern}) - message(STATUS "DROP LINT ${filename}") - set(LINT OFF) + list(REMOVE_ITEM SOURCES_LIST ${filename}) endif() endforeach() - if(LINT MATCHES ON) - get_filename_component(base_filename ${filename} NAME) - set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) - add_custom_command(OUTPUT ${CUR_GEN} - PRE_BUILD - COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" - "--write-success=${CUR_GEN}" ${filename} - DEPENDS ${filename} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - endif() endforeach() + + if(SOURCES_LIST) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py" + "--filter=${STYLE_FILTER}" + ${SOURCES_LIST} + COMMENT "cpplint: Checking source code style" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() endif() endmacro() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 9724c16122ab2e6be55864c8716698c9b9d7c3f0..84219cfa5587f5b765b2e8f35180797d7053169f 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -20,6 +20,7 @@ # The supported variables are listed belows: # # ANDROID_STANDALONE_TOOLCHAIN +# ANDROID_TOOLCHAIN # ANDROID_ABI # ANDROID_NATIVE_API_LEVEL # ANDROID_ARM_MODE @@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL) ENDIF() ENDIF() +IF(NOT DEFINED ANDROID_TOOLCHAIN) + SET(ANDROID_TOOLCHAIN clang) +ENDIF() + IF(NOT DEFINED ANDROID_ABI) SET(ANDROID_ABI "armeabi-v7a") ENDIF() @@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") "${CMAKE_VERSION}), when cross-compiling for Android.") IF(ANDROID_STANDALONE_TOOLCHAIN) + # Use standalone toolchain SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") IF(NOT CMAKE_SYSTEM_VERSION) @@ -96,22 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") 
ENDIF() # Toolchain - SET(ANDROID_TOOLCHAIN "gcc") SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) - IF(ANDROID_ABI STREQUAL "armeabi") - SET(CMAKE_SYSTEM_PROCESSOR armv5te) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_SYSTEM_PROCESSOR armv7-a) - ENDIF() + ELSE(ANDROID_NDK) + # TODO: use android ndk + ENDIF() + + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) + IF(ANDROID_ABI STREQUAL "armeabi") + SET(CMAKE_SYSTEM_PROCESSOR armv5te) + SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi) + ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_SYSTEM_PROCESSOR armv7-a) + SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi) ENDIF() - SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + SET(CMAKE_SYSTEM_PROCESSOR aarch64) + SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android) + ELSE() + MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.") + ENDIF() + SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") + + IF(ANDROID_TOOLCHAIN STREQUAL clang) + SET(ANDROID_C_COMPILER_NAME clang) + SET(ANDROID_CXX_COMPILER_NAME clang++) + SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) + SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) + ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc) + SET(ANDROID_C_COMPILER_NAME gcc) + SET(ANDROID_CXX_COMPILER_NAME g++) + ELSE() + MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}") ENDIF() # C compiler IF(NOT CMAKE_C_COMPILER) - SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc") + SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}") ELSE() GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) ENDIF() @@ -121,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") # CXX compiler IF(NOT CMAKE_CXX_COMPILER) - SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++") + SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}") ELSE() GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) ENDIF() @@ -133,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) # Toolchain and ABI specific flags. - SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64") + SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections") SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") IF(ANDROID_ABI STREQUAL "armeabi") @@ -141,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") -march=armv5te -mtune=xscale -msoft-float) - ENDIF() - IF(ANDROID_ABI STREQUAL "armeabi-v7a") + ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv7-a -mfloat-abi=softfp) @@ -152,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) ENDIF() LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) ENDIF() IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") @@ -160,6 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ELSE() LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) ENDIF() + IF(ANDROID_TOOLCHAIN STREQUAL clang) + # Disable integrated-as for better compatibility. 
+ LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as) + ENDIF() + ENDIF() + + IF(ANDROID_TOOLCHAIN STREQUAL clang) + # CMake automatically forwards all compiler flags to the linker, + # and clang doesn't like having -Wa flags being used for linking. + # To prevent CMake from doing this would require meddling with + # the CMAKE__COMPILE_OBJECT rules, which would get quite messy. + LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments) ENDIF() STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") @@ -186,6 +227,10 @@ ELSE() SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) ENDIF() SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) - SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) + IF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + ENDIF() + ENDIF() ENDIF() diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d3f5bf6852b3b295f3b5806b0577a880b0ce6ba6 --- /dev/null +++ b/cmake/cross_compiling/ios.cmake @@ -0,0 +1,347 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is a toolchain file for cross-compiling for iOS, and the +# configuration largely refers to public toolchain file: +# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake +# and +# https://github.com/cristeab/ios-cmake +# +# Supports options: +# IOS_PLATFORM = OS (default) or SIMULATOR +# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders +# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. +# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. +# IOS_ARCH +# The archectures wanted to support, such "arm64", "armv7;arm64" +# IOS_DEPLOYMENT_TARGET +# The minimum iOS deployment version, such as "7.0" +# IOS_ENABLE_BITCODE = ON (default) or OFF +# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON +# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder +# By default this location is automatcially chosen based on the IOS_PLATFORM value above. +# If set manually, it will override the default location and force the user of a particular Developer Platform +# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder +# By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value. +# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path. 
+# If set manually, this will force the use of a specific SDK version + +# Macros: +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) +# A convenience macro for setting xcode specific properties on targets +# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") +# find_host_package (PROGRAM ARGS) +# A macro used to find executable programs on the host system, not within the iOS environment. +# Thanks to the android-cmake project for providing the command + +if(NOT IOS) + return() +endif() + +set(CMAKE_SYSTEM_NAME Darwin) + +# Get the Xcode version being used. +execute_process(COMMAND xcodebuild -version + OUTPUT_VARIABLE XCODE_VERSION + RESULT_VARIABLE XCODE_VERSION_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT ${XCODE_VERSION_RESULT}) + string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") + string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") + message(STATUS "Building with Xcode version: ${XCODE_VERSION}") +else() + message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.") +endif() + +# Required as of cmake 2.8.10 +set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) + +# Setup iOS platform unless specified manually with IOS_PLATFORM +if(NOT DEFINED IOS_PLATFORM) + set(IOS_PLATFORM "OS") +endif() +set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") + +# Set the architecture for iOS +if(NOT DEFINED IOS_ARCH) + if(IOS_PLATFORM STREQUAL "OS") + set(IOS_ARCH "armv7;armv7s;arm64") + elseif(IOS_PLATFORM STREQUAL "SIMULATOR") + set(IOS_ARCH "i386;x86_64") + endif() +endif() +set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") + +# Specify minimum iOS deployment version +if(NOT DEFINED IOS_DEPLOYMENT_TARGET) + set(IOS_DEPLOYMENT_TARGET "7.0") +endif() +set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version") + +# Whether to enable bitcode +if(NOT DEFINED IOS_ENABLE_BITCODE) + set(IOS_ENABLE_BITCODE ON) +endif() +set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode") + +if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS) + set(IOS_USE_VECLIB_FOR_BLAS OFF) +endif() +set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib") + +# Check the platform selection and setup for developer root +if(${IOS_PLATFORM} STREQUAL "OS") + set(IOS_PLATFORM_LOCATION "iPhoneOS.platform") + set(XCODE_IOS_PLATFORM iphoneos) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos") +elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR") + set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") + set(XCODE_IOS_PLATFORM iphonesimulator) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") +elseif(${IOS_PLATFORM} STREQUAL "WATCHOS") + set(IOS_PLATFORM_LOCATION "WatchOS.platform") + set(XCODE_IOS_PLATFORM watchos) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos") +else(${IOS_PLATFORM} STREQUAL "OS") + message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. 
Please set to\n" + "\t OS, SIMULATOR, or WATCHOS.") +endif() + +# Check iOS developer toolchain +if(NOT DEFINED IOS_DEVELOPER_ROOT) + # Setup iOS developer location + execute_process(COMMAND xcode-select -print-path + OUTPUT_VARIABLE XCODE_DEVELOPER_DIR + RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # Xcode 4.3 changed the installation location, choose the most recent one available + if(${XCODE_VERSION} VERSION_LESS "4.3.0") + set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") + else() + set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer") + endif() +endif() +if(EXISTS ${IOS_DEVELOPER_ROOT}) + set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform") +else() + message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.") +endif() + +# Check iOS SDK +if(NOT DEFINED IOS_SDK_ROOT) + # Find and use the most recent iOS sdk + file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*") + if(IOS_SDK_LISTS) + list(SORT IOS_SDK_LISTS) + list(REVERSE IOS_SDK_LISTS) + list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT) + else(IOS_SDK_LISTS) + message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}." + " Please manually set IOS_SDK_ROOT or install the iOS SDK.") + endif(IOS_SDK_LISTS) +endif() +if(EXISTS ${IOS_SDK_ROOT}) + set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK") + message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}") +else() + message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.") +endif() + +# Set the sysroot default to the most recent SDK +set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") + +# Get version of iOS SDK +execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion + OUTPUT_VARIABLE IOS_SDK_VERSION + RESULT_VARIABLE IOS_SDK_VERSION_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(${IOS_SDK_VERSION_RESULT}) + string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}") +endif() +if(NOT IOS_SDK_VERSION) + message(WARNING "Cannot get SDK's version.") + set(IOS_SDK_VERSION 1) +endif() +set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION}) + +# Find the C & C++ compilers for the specified SDK. 
+if(NOT CMAKE_C_COMPILER) + # Default to use clang + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang + OUTPUT_VARIABLE IOS_C_COMPILER + RESULT_VARIABLE IOS_C_COMPILER_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(${IOS_C_COMPILER_RESULT}) + get_filename_component(IOS_C_COMPILER clang PROGRAM) + endif() +else(NOT CMAKE_C_COMPILER) + # User can set it in cmake command + get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) +endif(NOT CMAKE_C_COMPILER) +if(NOT EXISTS ${IOS_C_COMPILER}) + message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}") +endif() + +if(NOT CMAKE_CXX_COMPILER) + # Default to use clang++ + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ + OUTPUT_VARIABLE IOS_CXX_COMPILER + RESULT_VARIABLE IOS_CXX_COMPILER_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(${IOS_CXX_COMPILER_RESULT}) + get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM) + endif() +else(NOT CMAKE_CXX_COMPILER) + # User can set it in cmake command + get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) +endif(NOT CMAKE_CXX_COMPILER) +if(NOT EXISTS ${IOS_CXX_COMPILER}) + message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}") +endif() + +set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE) +set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) + +set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") +set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") +set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") +set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") + +# Set iOS specific C/C++ flags +if(IOS_PLATFORM STREQUAL "OS") + if(XCODE_VERSION VERSION_LESS "7.0") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") + else() + # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. 
+    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
+  endif()
+else()
+  set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
+endif()
+
+if(IOS_ENABLE_BITCODE)
+  set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode")
+else()
+  set(XCODE_IOS_BITCODE_FLAGS "")
+endif()
+
+set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}")
+
+# Hidden visibility is required for cxx on iOS
+set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+
+set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
+
+if(IOS_USE_VECLIB_FOR_BLAS)
+  # Find vecLib for iOS
+  set(VECLIB_SEARCH_DIRS
+      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks
+      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks
+      )
+  find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers)
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR)
+
+  if(VECLIB_FOUND)
+    if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
+      set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib")
+      message(STATUS "Found standalone vecLib.framework")
+    else()
+      set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate")
+      message(STATUS "Found vecLib as part of Accelerate.framework")
+    endif()
+
+  endif()
+endif()
+
+set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}")
+set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
+
+set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
+if(NOT IOS_ENABLE_BITCODE)
+  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
+  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
+else()
+  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib")
+  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle")
+endif()
+set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
+
+# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
+# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
+# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
+# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
+if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
+  find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
+endif()
+
+# Set the find root to the iOS developer roots and to user defined paths
+set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH}
+    CACHE STRING "iOS find search path root")
+
+# default to searching for frameworks first
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+    ${IOS_SDK_ROOT}/System/Library/Frameworks
+    ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks
+    ${IOS_SDK_ROOT}/Developer/Library/Frameworks
+    )
+
+# only search the iOS sdks, not the remainder of the host filesystem
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', "
+               "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'")
+message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
+message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+
+# Used in ExternalProject command
+string(REPLACE ";" "\\$<SEMICOLON>" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+set(EXTERNAL_OPTIONAL_ARGS
+    -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
+    -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES})
+
+# This little macro lets you set any XCode specific property
+macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
+  set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
+endmacro(set_xcode_property)
+
+# This macro lets you find executable programs on the host system
+macro(find_host_package)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+  set(IOS FALSE)
+
+  find_package(${ARGN})
+
+  set(IOS TRUE)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+endmacro(find_host_package)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..6bea7cf3022242ce48cc882915f7e71810937283
--- /dev/null
+++ b/cmake/cuda.cmake
@@ -0,0 +1,188 @@
+if(NOT WITH_GPU)
+  return()
+endif()
+
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs7 "30 35 50 52")
+set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+
+######################################################################################
+# A function for automatic detection of GPUs installed (if autodetection is enabled)
+# Usage:
+#   detect_installed_gpus(out_variable)
+function(detect_installed_gpus out_variable)
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+    file(WRITE ${cufile} ""
+      "#include <cstdio>\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
+
+
+########################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
+# Usage:
+#   select_nvcc_arch_flags(out_variable)
+function(select_nvcc_arch_flags out_variable)
+  # List of arch names
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
+  set(archs_name_default "All")
+  if(NOT CMAKE_CROSSCOMPILING)
+    list(APPEND archs_names "Auto")
+  endif()
+
+  # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in CMake-GUI)
+  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
+  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
+  mark_as_advanced(CUDA_ARCH_NAME)
+
+  # verify CUDA_ARCH_NAME value
+  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
+    string(REPLACE ";" ", " archs_names "${archs_names}")
+    message(FATAL_ERROR "Only ${archs_names} architecture names are supported.")
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
+  else()
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
+    set(cuda_arch_bin "30 35")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+    set(cuda_arch_bin "50")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    set(cuda_arch_bin "60 61")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
+    set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
+    set(cuda_arch_bin ${paddle_known_gpu_archs})
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
+    detect_installed_gpus(cuda_arch_bin)
+  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(cuda_arch_bin ${CUDA_ARCH_BIN})
+  endif()
+
+  # remove dots and convert to lists
+  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX REPLACE "\\."
"" cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + list(REMOVE_DUPLICATES cuda_arch_bin) + list(REMOVE_DUPLICATES cuda_arch_ptx) + + set(nvcc_flags "") + set(nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(arch ${cuda_arch_bin}) + if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) + list(APPEND nvcc_archs_readable sm_${arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(arch ${cuda_arch_ptx}) + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) + list(APPEND nvcc_archs_readable compute_${arch}) + endforeach() + + string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") + set(${out_variable} ${nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +message(STATUS "CUDA detected: " ${CUDA_VERSION}) +if (${CUDA_VERSION} LESS 7.0) + set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) +elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") +elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + # CUDA 8 may complain that sm_20 is no longer supported. Suppress the + # warning for now. + list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") +endif() + +include_directories(${CUDA_INCLUDE_DIRS}) +list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) +if(NOT WITH_DSO) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) +endif(NOT WITH_DSO) + +# setting nvcc arch flags +select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") + +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. +# So, don't set these flags here. 
+list(APPEND CUDA_NVCC_FLAGS "-std=c++11") +list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") +list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +# Set :expt-relaxed-constexpr to suppress Eigen warnings +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) +elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) +endif() + +mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) +mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 69f40df51680a104c47d9335c070c570dcaff59a..2c84061ff572de4687b4d496f8ded6deee8d1011 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -2,7 +2,7 @@ if(NOT WITH_GPU) return() endif() -set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT") +set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index 62eea42692b4191e53d0bbb0805786fd15ac7944..85cce80b70a1fcf57015ac7a264e4950616b2717 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -2,13 +2,13 @@ INCLUDE(ExternalProject) SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) -INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) +INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ExternalProject_Add( - linb_any + extern_lib_any ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/thelink2012/any.git" - GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" + GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git" + GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d" PREFIX ${ANY_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -17,5 +17,15 @@ ExternalProject_Add( TEST_COMMAND "" ) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + add_library(lib_any STATIC ${dummyfile}) +else() + add_library(lib_any INTERFACE) +endif() + +add_dependencies(lib_any extern_lib_any) + add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) -LIST(APPEND external_project_dependencies linb_any) \ No newline at end of file +LIST(APPEND external_project_dependencies lib_any) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c70d83b3f4bb24740ed67b4e2f98a3ced26d1648 --- /dev/null +++ b/cmake/external/boost.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
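+#
+# Note (descriptive only): boost is consumed header-only here, so nothing is configured
+# or built; the STATIC-dummy / INTERFACE-library pattern further down (the same trick
+# used for lib_any above and for eigen3/pybind elsewhere in this change) exists only so
+# that targets can list boost in DEPS and have the download finish before they compile.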
+ +include(ExternalProject) + +set(BOOST_PROJECT "extern_boost") +set(BOOST_VER "1.41.0") +set(BOOST_TAR "boost_1_41_0") +set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz") +set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) +set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") +set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) + +include_directories(${BOOST_INCLUDE_DIR}) + +ExternalProject_Add( + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz + && tar zxf ${BOOST_TAR}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + PREFIX ${BOOST_SOURCES_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(boost STATIC ${dummyfile}) +else() + add_library(boost INTERFACE) +endif() + +add_dependencies(boost ${BOOST_PROJECT}) +list(APPEND external_project_dependencies boost) +set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake new file mode 100644 index 0000000000000000000000000000000000000000..aec51410b33669f8a549f2eca193cc6aa2d07a13 --- /dev/null +++ b/cmake/external/cares.cmake @@ -0,0 +1,45 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: c-ares is needed when linking with grpc. + +SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares) +SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares) +SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." 
FORCE) + +ExternalProject_Add( + extern_cares + GIT_REPOSITORY "https://github.com/c-ares/c-ares.git" + GIT_TAG "cares-1_13_0" + PREFIX ${CARES_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND make -j8 + INSTALL_COMMAND make install +) + +ADD_LIBRARY(cares STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION + "${CARES_INSTALL_DIR}/lib/libcares.a") + +include_directories(${CARES_INCLUDE_DIR}) +ADD_DEPENDENCIES(cares extern_cares) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d49c8d601102cf865287c33349bff5eee6a90f6d --- /dev/null +++ b/cmake/external/eigen.cmake @@ -0,0 +1,36 @@ +INCLUDE(ExternalProject) + +SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) +SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) +INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) + +ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" + GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";") + add_library(eigen3 STATIC ${dummyfile}) +else() + add_library(eigen3 INTERFACE) +endif() + +add_dependencies(eigen3 extern_eigen3) + +LIST(APPEND external_project_dependencies eigen3) + +IF(NOT WITH_C_API AND WITH_FLUID) + INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen) + INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen) + INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported) +ENDIF() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index a0d0a892c4b3cc3743ac725f3cd90444f18abf34..60946304541a20809276c3e665d8524baf209006 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -18,9 +18,9 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags) SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." 
FORCE) IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ELSE(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) @@ -29,19 +29,21 @@ ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DBUILD_TESTING=OFF - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) @@ -49,3 +51,12 @@ SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) ADD_DEPENDENCIES(gflags extern_gflags) LIST(APPEND external_project_dependencies gflags) + +IF(WITH_C_API OR WITH_FLUID) + INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) + IF(ANDROID) + INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib) + ENDIF() +ENDIF() diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index b70e94a170f17cc61f61673609e6eb941662ea62..382fbda3b5cfeba893f03871cf65498d20804f36 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -19,37 +19,60 @@ SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) IF(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) ELSE(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." 
FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) +IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + # Using the unofficial glog for Android API < 21 + SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git") + SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8") +ELSE() + SET(GLOG_REPOSITORY "https://github.com/google/glog.git") + SET(GLOG_TAG "v0.3.5") +ENDIF() + ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags - GIT_REPOSITORY "https://github.com/google/glog.git" + GIT_REPOSITORY ${GLOG_REPOSITORY} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DWITH_GFLAGS=ON - CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags - CMAKE_ARGS -DBUILD_TESTING=OFF - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DWITH_GFLAGS=ON + -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -ADD_DEPENDENCIES(glog extern_glog) +ADD_DEPENDENCIES(glog extern_glog gflags) +LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) + +IF(WITH_C_API OR WITH_FLUID) + INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog) + IF(ANDROID) + INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib) + ENDIF() +ENDIF() diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..79b2449fe6689993bbee8a24ae7c46b452afe0a0 --- /dev/null +++ b/cmake/external/grpc.cmake @@ -0,0 +1,66 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) +SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) +SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." 
FORCE) +SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) +IF(APPLE) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) +ELSE() + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) +ENDIF() + +ExternalProject_Add( + extern_grpc + DEPENDS protobuf zlib + GIT_REPOSITORY "https://github.com/grpc/grpc.git" + GIT_TAG "v1.8.x" + PREFIX ${GRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + # NOTE(yuyang18): + # Disable -Werror, otherwise the compile will fail in MacOS. + # It seems that we cannot configure that by make command. + # Just dry run make command and remove `-Werror`, then use a shell to run make commands + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install +) + +# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them. +ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") + +ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") +ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgpr.a") + +ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") + +include_directories(${GRPC_INCLUDE_DIR}) +ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 77e06e983e9f8bfaf6320e3c67b85b692ed877fc..5a4aa7a5b71a4fdfd556a46037e6d1846d668fc4 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -34,26 +34,33 @@ IF(WITH_TESTING) "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." 
FORCE) ENDIF(WIN32) + IF(WITH_MKLML) + # wait for mklml downloading completed + SET(GTEST_DEPENDS ${MKLML_PROJECT}) + ENDIF() + ExternalProject_Add( extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${GTEST_DEPENDS} GIT_REPOSITORY "https://github.com/google/googletest.git" GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DBUILD_GMOCK=ON - CMAKE_ARGS -Dgtest_disable_pthreads=ON - CMAKE_ARGS -Dgtest_force_shared_crt=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_GMOCK=ON + -Dgtest_disable_pthreads=ON + -Dgtest_force_shared_crt=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake new file mode 100644 index 0000000000000000000000000000000000000000..89fc34796a03ff3f3e5b022ae10b2646832b1ac7 --- /dev/null +++ b/cmake/external/mkldnn.cmake @@ -0,0 +1,92 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLDNN}) + return() +ENDIF(NOT ${WITH_MKLDNN}) + +INCLUDE(ExternalProject) + +SET(MKLDNN_PROJECT "extern_mkldnn") +SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) +SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Force WITH_MKLDNN=OFF") + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) + return() +ENDIF() + +SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." 
FORCE) +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") + +INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) + +IF(${CBLAS_PROVIDER} STREQUAL "MKLML") + SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) + MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}") +ELSE() + MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") +ENDIF() + +SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow") +SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow") +ExternalProject_Add( + ${MKLDNN_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_DEPENDS} + GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" + GIT_TAG "v0.11" + PREFIX ${MKLDNN_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} + CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + -DMKLROOT:PATH=${MKLML_ROOT} +) + +ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) +ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) +MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") +add_definitions(-DPADDLE_WITH_MKLDNN) +LIST(APPEND external_project_dependencies shared_mkldnn) + +# generate a static dummy target to track mkldnn dependencies +# for cc_library(xxx SRCS xxx.c DEPS mkldnn) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +ADD_LIBRARY(mkldnn STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) + +# copy the real so.0 lib to install dir +# it can be directly contained in wheel or capi +SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) +ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} + COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + DEPENDS mkldnn) +ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) + +IF(WITH_C_API) + INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) +ENDIF() + diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake new file mode 100644 index 0000000000000000000000000000000000000000..15a07ea3daf5aa606235f20288a8306966334a1a --- /dev/null +++ b/cmake/external/mklml.cmake @@ -0,0 +1,72 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLML}) + return() +ENDIF(NOT ${WITH_MKLML}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLML in Paddle yet." 
+ "Force WITH_MKLML=OFF") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(MKLML_PROJECT "extern_mklml") +SET(MKLML_VER "mklml_lnx_2018.0.1.20171007") +SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") +SET(MKLML_DST_DIR "mklml") +SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) +SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) +SET(MKLML_INC_DIR ${MKLML_ROOT}/include) +SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) +SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) +SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") + +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) + +FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(MKLML)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${MKLML_VER}\n" + " DESTINATION ${MKLML_DST_DIR})\n") + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz + && tar zxf ${MKLML_VER}.tgz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} +) + +ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) +ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +LIST(APPEND external_project_dependencies mklml) + +IF(WITH_C_API) + INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake new file mode 100644 index 0000000000000000000000000000000000000000..fc43766efafc3d3e16f2906ce7f9a3d692c8e4ff --- /dev/null +++ b/cmake/external/nccl.cmake @@ -0,0 +1,67 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_GPU) + return() +endif() + +include(ExternalProject) + +set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl) + +include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src) + +if(WITH_DSO) + # If we use DSO, we do not build nccl, just download the dependencies + set(NCCL_BUILD_COMMAND "") + set(NCCL_INSTALL_COMMAND "") + set(NCCL_INSTALL_DIR "") +else() + # otherwise, we build nccl and link it. 
+ set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl) + # Note: cuda 8.0 is needed to make nccl + # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root + set(NCCL_BUILD_COMMAND "make -j 8") + set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}") +endif() + +ExternalProject_Add( + extern_nccl + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git" + GIT_TAG "v1.3.4-1" + PREFIX "${NCCL_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "${NCCL_BUILD_COMMAND}" + INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}" + INSTALL_DIR "${NCCL_INSTALL_DIR}" + TEST_COMMAND "" +) + +if(WITH_DSO) + if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";") + add_library(nccl STATIC ${dummyfile}) + else() + add_library(nccl INTERFACE) + endif() +else() + add_library(nccl STATIC IMPORTED GLOBAL) + set_property(TARGET nccl PROPERTY IMPORTED_LOCATION + ${NCCL_INSTALL_DIR}/lib/libnccl_static.a) +endif() + +add_dependencies(nccl extern_nccl) diff --git a/cmake/external/nnpack.cmake b/cmake/external/nnpack.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d42bcb0f329041462bd8b568052fbb8226d18e4e --- /dev/null +++ b/cmake/external/nnpack.cmake @@ -0,0 +1,30 @@ +# Find the NNPACK library +# NNPACK_ROOT - where to find NNPACK include and library. +# + +set(NNPACK_FOUND OFF) +set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") +find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) +find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) +find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib) + +if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) + set(NNPACK_FOUND ON) + INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) + + set(NNPACK_LIBS) + list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB}) + if (NNPACK_UKERNELS_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB}) + endif() + if (NNPACK_CPUFEATURES_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB}) + endif() + if(NOT ANDROID) + list(APPEND NNPACK_LIBS "rt") + endif() +else() + message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") +endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 2341e3785bd8e951e10e3f6bbf8a32f63e4ae44d..4012a164be1d20bba050fb09749b11afc7b99588 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -1,17 +1,21 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
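+# Note (descriptive only): when USE_EIGEN_FOR_BLAS is ON, Eigen's own routines stand in
+# for BLAS, so the OpenBLAS external project below is skipped entirely.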
+IF(USE_EIGEN_FOR_BLAS) + return() +ENDIF(USE_EIGEN_FOR_BLAS) + INCLUDE(cblas) IF(NOT ${CBLAS_FOUND}) @@ -21,30 +25,49 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) - SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}" + SET(CBLAS_LIBRARIES + "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) - SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") + SET(OPENBLAS_COMMIT "v0.2.20") IF(CMAKE_CROSSCOMPILING) + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) + GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) + SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) IF(ANDROID) - # arm_soft_fp_abi branch of OpenBLAS to support softfp - # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + # use softfp + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) + ENDIF() + ELSEIF(IOS) + IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") + SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) + ELSE() + MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " + "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") + ENDIF() ELSEIF(RPI) # use hardfp - SET(OPENBLAS_COMMIT "v0.2.19") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) ENDIF() ELSE() - SET(OPENBLAS_COMMIT "v0.2.19") + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() SET(OPTIONAL_ARGS "") IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() ENDIF() + SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) + ExternalProject_Add( extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} @@ -58,14 +81,48 @@ IF(NOT ${CBLAS_FOUND}) UPDATE_COMMAND "" CONFIGURE_COMMAND "" ) + SET(CBLAS_PROVIDER openblas) + IF(WITH_C_API) + INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) + # Because libopenblas.a is a symbolic link of another library, thus need to + # install the whole directory. 
+ IF(ANDROID) + SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI}) + ELSE() + SET(TMP_INSTALL_DIR third_party/openblas/lib) + ENDIF() + INSTALL(CODE "execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib + ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} + )" + ) + INSTALL(CODE "MESSAGE(STATUS \"Installing: \" + \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" + )" + ) + INSTALL(CODE "execute_process( + COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake + ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig + )" + ) + ENDIF() ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) -ADD_LIBRARY(cblas STATIC IMPORTED) -SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES}) +# FIXME(gangliao): generate cblas target to track all high performance +# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) +FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";") +ADD_LIBRARY(cblas STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) LIST(APPEND external_project_dependencies cblas) +ELSE() + IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + ADD_DEPENDENCIES(cblas mklml) + ENDIF() ENDIF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7340394b1e1fad9e1893ac87d62febb8dd72751c..365a370a9cfb708379bcff18ae6aa0725d420ae1 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -13,12 +13,122 @@ # limitations under the License. INCLUDE(ExternalProject) +# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp +FIND_PACKAGE(Protobuf QUIET) +macro(UNSET_VAR VAR_NAME) + UNSET(${VAR_NAME} CACHE) + UNSET(${VAR_NAME}) +endmacro() +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(PROTOBUF_FOUND) +UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) +UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) +UNSET_VAR(PROTOBUF_LITE_LIBRARY) +UNSET_VAR(PROTOBUF_LIBRARY) +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) + +if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. 
+ function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + endfunction() +endif() +# Print and set the protobuf library information, +# finish this cmake process and exit from this file. macro(PROMPT_PROTOBUF_LIB) + SET(protobuf_DEPS ${ARGN}) + MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}") INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) + + # Assuming that all the protobuf libraries are of the same type. + IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE STATIC) + ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE SHARED) + ELSE() + MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") + ENDIF() + + ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) + + ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY}) + + ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY}) + + ADD_EXECUTABLE(protoc IMPORTED GLOBAL) + SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE}) + # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. + # make `protobuf_generate_cpp` happy. 
+ SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + FOREACH(dep ${protobuf_DEPS}) + ADD_DEPENDENCIES(protobuf ${dep}) + ADD_DEPENDENCIES(protobuf_lite ${dep}) + ADD_DEPENDENCIES(libprotoc ${dep}) + ADD_DEPENDENCIES(protoc ${dep}) + ENDFOREACH() + + LIST(APPEND external_project_dependencies protobuf) RETURN() endmacro() macro(SET_PROTOBUF_VERSION) @@ -28,11 +138,11 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") if (NOT "${PROTOBUF_ROOT}" STREQUAL "") - find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include) - find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib) - find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin) + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") SET_PROTOBUF_VERSION() @@ -43,22 +153,23 @@ if (NOT "${PROTOBUF_ROOT}" STREQUAL "") endif() FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) - SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME}) - SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME}) + STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(${TARGET_NAME}_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_PROTOC_EXECUTABLE - "${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" PARENT_SCOPE) SET(OPTIONAL_CACHE_ARGS "") @@ -72,46 +183,54 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" "-Dprotobuf_WITH_ZLIB=ON" - "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}") + "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" + ${EXTERNAL_OPTIONAL_ARGS}) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + IF(MOBILE_INFERENCE) + # The reason why the official version is not used is described in + # 
https://github.com/PaddlePaddle/Paddle/issues/6114 + SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") + SET(PROTOBUF_TAG "v3.2.0") + IF(NOT BUILD_FOR_HOST) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") + ENDIF() + ENDIF() + ExternalProject_Add( ${TARGET_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PROTOBUF_SOURCES_DIR} UPDATE_COMMAND "" DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + GIT_REPOSITORY ${PROTOBUF_REPO} + GIT_TAG ${PROTOBUF_TAG} CONFIGURE_COMMAND ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake ${OPTIONAL_ARGS} -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON ${OPTIONAL_CACHE_ARGS} ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1) -IF(NOT CMAKE_CROSSCOMPILING) - FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION}) - - IF(PROTOBUF_FOUND) - SET_PROTOBUF_VERSION() - IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0") - SET(PROTOBUF_FOUND OFF) - ENDIF() - ENDIF(PROTOBUF_FOUND) +IF(NOT MOBILE_INFERENCE) + SET(PROTOBUF_VERSION 3.1) ELSE() + SET(PROTOBUF_VERSION 3.2) +ENDIF() +IF(CMAKE_CROSSCOMPILING) build_protobuf(protobuf_host TRUE) LIST(APPEND external_project_dependencies protobuf_host) @@ -120,18 +239,31 @@ ELSE() ENDIF() IF(NOT PROTOBUF_FOUND) - build_protobuf(protobuf FALSE) - LIST(APPEND external_project_dependencies protobuf) + build_protobuf(extern_protobuf FALSE) - SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR} + SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} CACHE PATH "protobuf include directory." FORCE) - IF(NOT CMAKE_CROSSCOMPILING) - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE} + SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY} + CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} + CACHE FILEPATH "protoc library." FORCE) + + IF(WITH_C_API OR WITH_FLUID) + INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) + IF(ANDROID) + INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib) + ENDIF() + ENDIF() + + IF(CMAKE_CROSSCOMPILING) + PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) + ELSE() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) ENDIF() - SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." 
FORCE) ENDIF(NOT PROTOBUF_FOUND) - -PROMPT_PROTOBUF_LIB() \ No newline at end of file diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4e87dc49d8956d1fa6dec777efc5a63c6b0f79a5 --- /dev/null +++ b/cmake/external/pybind11.cmake @@ -0,0 +1,46 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_PYTHON) + return() +endif() + +include(ExternalProject) + +set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind) + +include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include) + +ExternalProject_Add( + extern_pybind + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/pybind/pybind11.git" + GIT_TAG "v2.1.1" + PREFIX ${PYBIND_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";") + add_library(pybind STATIC ${dummyfile}) +else() + add_library(pybind INTERFACE) +endif() + +add_dependencies(pybind extern_pybind) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index f4d0daab06c9fcf17f4af59c25f62b415074a52f..46c68cce324f565ec9985ef1a280d6d933f88f1f 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -12,16 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +IF(NOT WITH_PYTHON) + return() +ENDIF() + INCLUDE(python_module) FIND_PACKAGE(PythonInterp 2.7) -IF(WITH_PYTHON) - FIND_PACKAGE(PythonLibs 2.7) -ENDIF(WITH_PYTHON) +FIND_PACKAGE(PythonLibs 2.7) +# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. +ADD_LIBRARY(python SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET(py_env "") -SET(USE_VIRTUALENV_FOR_TEST 1) IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) find_python_module(numpy REQUIRED) @@ -32,198 +35,7 @@ IF(PYTHONINTERP_FOUND) MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " "please use pip to upgrade protobuf. 
pip install -U protobuf") ENDIF() -ELSE(PYTHONINTERP_FOUND) - MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.") - ##################################### PYTHON ######################################## - SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python) - SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python) - SET(_python_DIR ${PYTHON_INSTALL_DIR}) - - IF(UNIX) - SET(PYTHON_FOUND ON) - SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE) - SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE) - SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE) - SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE) - ELSEIF(WIN32) - SET(PYTHON_FOUND ON) - SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE) - SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE) - SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE) - SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE) - ELSE() - MESSAGE(FATAL_ERROR "Unknown system !") - ENDIF() - - IF(APPLE) - LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS - -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON - ) - ENDIF() - - SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS) - - # Force Python build to "Release". - IF(CMAKE_CONFIGURATION_TYPES) - SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR}) - SET(CMAKE_CFG_INTDIR "Release") - ELSE() - LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS - -DCMAKE_BUILD_TYPE:STRING=Release - ) - ENDIF() - - ExternalProject_Add(python - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git" - PREFIX ${PYTHON_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DPYTHON_VERSION=2.7.12 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR} - -DBUILD_LIBPYTHON_SHARED:BOOL=OFF - -DUSE_SYSTEM_LIBRARIES:BOOL=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR} - -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES} - -DDOWNLOAD_SOURCES:BOOL=ON - -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF - ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS} - ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS} - DEPENDS zlib - ) - - SET(py_env - PATH=${PYTHON_INSTALL_DIR}/bin - PYTHONHOME=${PYTHON_INSTALL_DIR} - PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH}) - #################################################################################### - - ##################################### SETUPTOOLS ################################### - SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools) - ExternalProject_Add(setuptools - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SETUPTOOLS_SOURCES_DIR} - URL "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz" - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python zlib - ) - ##################################################################################### - - ##################################### 
SIX ########################################### - SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six) - ExternalProject_Add(six - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SIX_SOURCES_DIR} - URL https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python setuptools - ) - ##################################################################################### - - ##################################### CYTHON ######################################## - SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython) - ExternalProject_Add(cython - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${CYTHON_SOURCES_DIR} - URL https://github.com/cython/cython/archive/0.25.2.tar.gz - GIT_TAG 0.25.2 - BUILD_IN_SOURCE 1 - CONFIGURE_COMMAND "" - PATCH_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python - ) - #################################################################################### - - ##################################### NUMPY ######################################## - SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy) - SET(NUMPY_TAG_VERSION "v1.11.3") - SET(NUMPY_VERSION "1.11.3") - - SET(EGG_NAME "") - SET(PYTHON_NUMPY_INCLUDE_DIR "") - IF(WIN32) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg") - ELSE(WIN32) - IF(APPLE) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}") - ELSE(APPLE) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux") - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux") - ENDIF(APPLE) - - FOREACH(suffix x86_64 intel fat64 fat32 universal) - LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include) - ENDFOREACH() - ENDIF(WIN32) - - ExternalProject_Add(numpy - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY https://github.com/numpy/numpy.git - GIT_TAG ${NUMPY_TAG_VERSION} - CONFIGURE_COMMAND "" - UPDATE_COMMAND "" - PREFIX ${NUMPY_SOURCES_DIR} - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py build - INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - BUILD_IN_SOURCE 1 - DEPENDS python setuptools cython - ) - #################################################################################### - - ##################################### WHEEL ######################################## - SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel) - ExternalProject_Add(wheel - ${EXTERNAL_PROJECT_LOG_ARGS} - URL https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz - PREFIX ${WHEEL_SOURCES_DIR} - CONFIGURE_COMMAND "" - UPDATE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - BUILD_IN_SOURCE 1 - DEPENDS python setuptools - ) - #################################################################################### - - ################################### PROTOBUF ####################################### - SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf) - ExternalProject_Add(python-protobuf - ${EXTERNAL_PROJECT_LOG_ARGS} - URL https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz - URL_MD5 38b5fb160c768d2f8444d0c6d637ff91 - PREFIX ${PY_PROTOBUF_SOURCES_DIR} - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py build - 
INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python setuptools six - ) - #################################################################################### - - LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy) - ENDIF(PYTHONINTERP_FOUND) -IF(WITH_PYTHON) - INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) - INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) -ELSE() - SET(PYTHON_LIBRARIES "") -ENDIF() +INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) +INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake index 744c766ee7b067058b2cb4aa7f7b761cbb9778d4..9db457c7b2d61228e5d5af6827c4cda11a20a463 100644 --- a/cmake/external/swig.cmake +++ b/cmake/external/swig.cmake @@ -1,17 +1,21 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +IF(NOT WITH_SWIG_PY) + return() +ENDIF() + FIND_PACKAGE(SWIG) IF(NOT SWIG_FOUND) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 2d7daed9bcd5b8d854ffae6dc1ea191d154c16fe..7cb4efa7bff7164464f1210a2b2188226c219ef6 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -12,29 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
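# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the diff): the reworked external/*.cmake
# scripts share one pattern -- return() early when the dependency is disabled
# for this build, otherwise expose the prebuilt artifact as an IMPORTED GLOBAL
# target that other targets can link by name. WITH_FOO, foo, extern_foo and
# FOO_LIBRARIES below are placeholder names, not real Paddle options.
if(NOT WITH_FOO)
  return()
endif()
add_library(foo SHARED IMPORTED GLOBAL)
set_property(TARGET foo PROPERTY IMPORTED_LOCATION ${FOO_LIBRARIES})
add_dependencies(foo extern_foo)  # extern_foo would be the ExternalProject target
# A consumer then simply links the target: target_link_libraries(my_exe foo)
# -----------------------------------------------------------------------------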
+IF(MOBILE_INFERENCE) + return() +ENDIF() + INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) -SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) - -INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) -SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) - -IF(WIN32) - SET(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE) -ELSE(WIN32) - IF(APPLE) - SET(_warpctc_SHARED_SUFFIX dylib) - ELSE(APPLE) - SET(_warpctc_SHARED_SUFFIX so) - ENDIF(APPLE) - - SET(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) -ENDIF(WIN32) +SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) +SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) SET(USE_OMP OFF) @@ -46,26 +39,31 @@ ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git" + GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09 PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - CMAKE_ARGS -DWITH_GPU=${WITH_GPU} - CMAKE_ARGS -DWITH_OMP=${USE_OMP} - CMAKE_ARGS -DWITH_TORCH=OFF - CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - CMAKE_ARGS -DBUILD_SHARED=ON - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) -ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL) +MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) + +ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) ADD_DEPENDENCIES(warpctc extern_warpctc) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 45ca5542b7dc30216b45487782f849b93c5f8fca..1638cd8fdfc34575132462859e056a1907f0b2f1 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,18 +34,30 @@ ExternalProject_Add( GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} - CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_MACOSX_RPATH=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) LIST(APPEND external_project_dependencies zlib) +ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) + +IF(WITH_C_API) + INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) + IF(ANDROID) + INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib) + ENDIF() +ENDIF() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 7a996dea92b13bdac054a987a004a3d54ff02da2..1120677a37e0d44163816b66600121c8f0d545af 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -109,7 +109,11 @@ set(COMMON_FLAGS -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare - -Wno-error=unused-local-typedefs) + -Wno-error=unused-local-typedefs + -Wno-error=parentheses-equality # Warnings in pybind11 + -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 + -Wno-error=terminate # Warning in PADDLE_ENFORCE +) set(GPU_COMMON_FLAGS -fPIC @@ -122,11 +126,14 @@ set(GPU_COMMON_FLAGS -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=unused-function # Warnings in Numpy Header. + -Wno-error=array-bounds # Warnings in Eigen::array ) if (APPLE) - # On Mac OS X build fat binaries with x86_64 architectures by default. - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + if(NOT CMAKE_CROSSCOMPILING) + # On Mac OS X build fat binaries with x86_64 architectures by default. + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + endif() else() set(GPU_COMMON_FLAGS -Wall @@ -144,57 +151,3 @@ endforeach() foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() - - -set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. -# So, don't set these flags here. 
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) -LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) -endif() - -function(specify_cuda_arch cuda_version cuda_arch) - if(${cuda_version} VERSION_GREATER "8.0") - foreach(capability 61 62) - if(${cuda_arch} STREQUAL ${capability}) - list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") - endif() - endforeach() - elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53") - list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") - endif() -endfunction() - -# Common gpu architectures: Kepler, Maxwell -foreach(capability 30 35 50) - list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}") -endforeach() - -if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0") - list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52") -endif() - -# Modern gpu architectures: Pascal -if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") - list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") -endif() - -# Custom gpu architecture -set(CUDA_ARCH) - -if(CUDA_ARCH) - specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH}) -endif() - -set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) - diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 43cd6b398b1caac55b938d576b96eb0282c00fda..18770fe2861380ea1320aef5cb7ec3432147d7ce 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -11,56 +11,205 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -# To simplify the build process of PaddlePaddle, we defined couple of -# fundamental abstractions, e.g., how to build library, binary and -# test in C++, CUDA and Go. +# generic.cmake defines CMakes functions that look like Bazel's +# building rules (https://bazel.build/). +# # # ------------------------------------------- -# C++ CUDA C++ Go +# C++ CUDA C++ Go # ------------------------------------------- -# cc_library nv_library go_library -# cc_binary nv_binary go_binary -# cc_test nv_test go_test +# cc_library nv_library go_library +# cc_binary nv_binary go_binary +# cc_test nv_test go_test # ------------------------------------------- # -# cmake_parse_arguments can help us to achieve this goal. 
-# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html +# To build a static library example.a from example.cc using the system +# compiler (like GCC): +# +# cc_library(example SRCS example.cc) +# +# To build a static library example.a from multiple source files +# example{1,2,3}.cc: +# +# cc_library(example SRCS example1.cc example2.cc example3.cc) +# +# To build a shared library example.so from example.cc: +# +# cc_library(example SHARED SRCS example.cc) +# +# To build a library using Nvidia's NVCC from .cu file(s), use the nv_ +# prefixed version: +# +# nv_library(example SRCS example.cu) +# +# To specify that a library new_example.a depends on other libraies: +# +# cc_library(new_example SRCS new_example.cc DEPS example) +# +# Static libraries can be composed of other static libraries: +# +# cc_library(composed DEPS dependent1 dependent2 dependent3) +# +# To build an executable binary file from some source files and +# dependent libraries: # +# cc_binary(example SRCS main.cc something.cc DEPS example1 example2) +# +# To build an executable binary file using NVCC, use the nv_ prefixed +# version: +# +# nv_binary(example SRCS main.cc something.cu DEPS example1 example2) +# +# To build a unit test binary, which is an executable binary with +# GoogleTest linked: +# +# cc_test(example_test SRCS example_test.cc DEPS example) +# +# To build a unit test binary using NVCC, use the nv_ prefixed version: +# +# nv_test(example_test SRCS example_test.cu DEPS example) +# +# It is pretty often that executable and test binaries depend on +# pre-defined external libaries like glog and gflags defined in +# /cmake/external/*.cmake: +# +# cc_test(example_test SRCS example_test.cc DEPS example glog gflags) +# +# To build a go static library using Golang, use the go_ prefixed version: +# +# go_library(example STATIC) +# +# To build a go shared library using Golang, use the go_ prefixed version: +# +# go_library(example SHARED) +# + +# including binary directory for generated headers. +include_directories(${CMAKE_CURRENT_BINARY_DIR}) -if(NOT APPLE) +if(NOT APPLE AND NOT ANDROID) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) -endif(NOT APPLE) - -# cc_library parses tensor.cc and figures out that target also depend on tensor.h. -# cc_library(tensor -# SRCS -# tensor.cc -# DEPS -# variant) + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") +endif(NOT APPLE AND NOT ANDROID) + +function(merge_static_libs TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + + # Get all propagation dependencies from the merged libraries + foreach(lib ${libs}) + list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) + endforeach() + list(REMOVE_DUPLICATES libs_deps) + + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + if(APPLE) # Use OSX's libtool to merge archives + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). 
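    # In other words: the dummy .c file written below gives the merged archive a
    # compilable source of its own; the touch command re-runs whenever one of
    # ${libs} changes, so lib${TARGET_NAME}.a gets relinked; and the POST_BUILD
    # step afterwards discards that placeholder archive and recreates it with
    # `libtool -static` from the input archives.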
+ add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged + set(libfiles ${libfiles} $) + endforeach() + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" + COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles} + ) + else() # general UNIX: use "ar" to extract objects and re-add to a common lib + set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir) + + foreach(lib ${libs}) + set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library + set(objdir ${target_DIR}/${lib}.objdir) + + add_custom_command(OUTPUT ${objdir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} + DEPENDS ${lib}) + + add_custom_command(OUTPUT ${objlistfile} + COMMAND ${CMAKE_AR} -x "$" + COMMAND ${CMAKE_AR} -t "$" > ${objlistfile} + DEPENDS ${lib} ${objdir} + WORKING_DIRECTORY ${objdir}) + + list(APPEND target_OBJS "${objlistfile}") + endforeach() + + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). + add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs} ${target_OBJS}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + # Get the file name of the generated library + set(target_LIBNAME "$") + + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'` + COMMAND ${CMAKE_RANLIB} ${target_LIBNAME} + WORKING_DIRECTORY ${target_DIR}) + endif() +endfunction(merge_static_libs) + function(cc_library TARGET_NAME) - set(options OPTIONAL) + set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (${cc_library_OPTIONAL} STREQUAL "SHARED") - add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) - else() - add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) - endif() - if (cc_library_DEPS) - add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) - endif() + if (cc_library_SRCS) + if (cc_library_SHARED OR cc_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) + else() + add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + endif() + if (cc_library_DEPS) + add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) + endif() + + # cpplint code style + foreach(source_file ${cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) + + else(cc_library_SRCS) + if(cc_library_DEPS) + merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) + else() + message(FATAL "Please 
specify source file or library in cc_library.") + endif() + endif(cc_library_SRCS) endfunction(cc_library) -# cc_binary parses tensor.cc and figures out that target also depend on tensor.h. -# cc_binary(tensor -# SRCS -# tensor.cc) function(cc_binary TARGET_NAME) - set(options OPTIONAL) + set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -71,48 +220,56 @@ function(cc_binary TARGET_NAME) endif() endfunction(cc_binary) -# The dependency to target tensor implies that if any of -# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built. -# cc_test(tensor_test -# SRCS -# tensor_test.cc -# DEPS -# tensor) function(cc_test TARGET_NAME) if(WITH_TESTING) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_test(${TARGET_NAME} ${TARGET_NAME}) + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) + target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") + list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) + endif() + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_test(NAME ${TARGET_NAME} + COMMAND ${TARGET_NAME} ${cc_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) -# Suppose that ops.cu includes global functions that take Tensor as -# their parameters, so ops depend on tensor. This implies that if -# any of tensor.{h.cc}, ops.{h,cu} is changed, ops need to be re-built. 
-# nv_library(ops -# SRCS -# ops.cu -# DEPS -# tensor) function(nv_library TARGET_NAME) if (WITH_GPU) - set(options OPTIONAL) + set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (${nv_library_OPTIONAL} STREQUAL "SHARED") - cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) - else() - cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) - endif() - if (nv_library_DEPS) - add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) - endif() + if(nv_library_SRCS) + if (nv_library_SHARED OR nv_library_shared) # build *.so + cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) + else() + cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + endif() + if (nv_library_DEPS) + add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${nv_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS}) + else(nv_library_SRCS) + if (nv_library_DEPS) + merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(nv_library_SRCS) endif() endfunction(nv_library) @@ -130,13 +287,6 @@ function(nv_binary TARGET_NAME) endif() endfunction(nv_binary) -# The dependency to target tensor implies that if any of -# ops{.h,.cu,_test.cu} is changed, ops_test need to be re-built. -# nv_test(ops_test -# SRCS -# ops_test.cu -# DEPS -# ops) function(nv_test TARGET_NAME) if (WITH_GPU AND WITH_TESTING) set(options "") @@ -144,50 +294,72 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) -set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go") -file(MAKE_DIRECTORY ${GOPATH}) - -# Because api.go defines a GO wrapper to ops and tensor, it depends on -# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or -# api.go is changed, api need to be re-built. -# go_library(api -# SRCS -# api.go -# DEPS -# tensor # Because ops depend on tensor, this line is optional. 
-# ops) function(go_library TARGET_NAME) - set(options OPTIONAL) + set(options STATIC static SHARED shared) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs DEPS) cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (${go_library_OPTIONAL} STREQUAL "SHARED") + + if (go_library_SHARED OR go_library_shared) set(BUILD_MODE "-buildmode=c-shared") - if(APPLE) - set(LIB_NAME "lib${TARGET_NAME}.dylib") - else() - set(LIB_NAME "lib${TARGET_NAME}.so") - endif() + set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") else() set(BUILD_MODE "-buildmode=c-archive") - set(LIB_NAME "lib${TARGET_NAME}.a") + set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") endif() - add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} - -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}" - ${go_library_SRCS} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS}) - add_library(${TARGET_NAME} STATIC IMPORTED) - set_property(TARGET ${TARGET_NAME} PROPERTY - IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}") - add_dependencies(${TARGET_NAME} ${TARGET_NAME}_lib) + + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # This custom command will always run since it depends on a not + # existing file. + add_custom_command( + OUTPUT dummy_rebulid_${TARGET_NAME} + COMMAND cmake -E touch ${dummyfile} + ) + # Create a custom target that depends on the custom command output + # file, so the custom command can be referenced as a dependency by + # `add_dependencies`. + add_custom_target(rebuild_${TARGET_NAME} + DEPENDS dummy_rebulid_${TARGET_NAME} + ) + + # Add dummy code to support `make target_name` under Terminal Command + file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";") + if (go_library_SHARED OR go_library_shared) + add_library(${TARGET_NAME} SHARED ${dummyfile}) + else() + add_library(${TARGET_NAME} STATIC ${dummyfile}) + endif() + if(go_library_DEPS) + add_dependencies(${TARGET_NAME} ${go_library_DEPS}) + endif(go_library_DEPS) + + # The "source file" of the library is `${dummyfile}` which never + # change, so the target will never rebuild. Make the target depends + # on the custom command that touches the library "source file", so + # rebuild will always happen. 
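  # Net effect: every build relinks the dummy archive, and the POST_BUILD
  # command further down deletes it and regenerates the real library with
  # `go build ${BUILD_MODE}` (c-archive or c-shared), executed from
  # ${PADDLE_IN_GOPATH}/go because the Go toolchain must run under GOPATH.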
+ add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME}) + + set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}") + + file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${${TARGET_NAME}_LIB_PATH}" + # Golang build source code + COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + -o "${${TARGET_NAME}_LIB_PATH}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" + # must run under GOPATH + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_dependencies(${TARGET_NAME} go_vendor) endfunction(go_library) function(go_binary TARGET_NAME) @@ -195,32 +367,156 @@ function(go_binary TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_command(OUTPUT ${TARGET_NAME}_timestamp COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" - ${go_library_SRCS} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS}) + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) endfunction(go_binary) function(go_test TARGET_NAME) set(options OPTIONAL) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs DEPS) cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test + string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" - ${go_test_SRCS} + ".${CMAKE_CURRENT_SOURCE_REL_DIR}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS}) - add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}) endfunction(go_test) -# go_extern will download extern go project. 
-# go_extern(target_name extern_source) -# go_extern(go_redis github.com/hoisie/redis) -function(go_extern TARGET_NAME) - add_custom_target(${TARGET_NAME} env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get ${ARGN}) -endfunction(go_extern) +# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support +# Usage: +# paddle_protobuf_generate_cpp( ) + +function(paddle_protobuf_generate_cpp SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files") + return() + endif() + + set(${SRCS}) + set(${HDRS}) + + if (MOBILE_INFERENCE) + set(EXTRA_FLAG "lite:") + else() + set(EXTRA_FLAG "") + endif() + + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + list(APPEND ${SRCS} "${_protobuf_protoc_src}") + list(APPEND ${HDRS} "${_protobuf_protoc_hdr}") + + add_custom_command( + OUTPUT "${_protobuf_protoc_src}" + "${_protobuf_protoc_hdr}" + + COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + -I${CMAKE_CURRENT_SOURCE_DIR} + --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + DEPENDS ${ABS_FIL} protoc + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() + + +function(proto_library TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(proto_srcs) + set(proto_hdrs) + paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) + cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) +endfunction() + +function(py_proto_compile TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(py_srcs) + protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) + add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) +endfunction() + +function(py_test TARGET_NAME) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python + ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() +endfunction() + +# grpc_library generate grpc code using grpc_cpp_plugin and protoc +# then build the generated protobuf code and grpc code with your +# implementation source codes together. Use SRCS argument for your +# implementation source files and PROTO argument for your .proto +# files. 
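# All generated code lands in ${CMAKE_CURRENT_BINARY_DIR}: a foo.proto passed
# via PROTO yields foo.pb.{cc,h} (from protobuf_generate_cpp) plus
# foo.grpc.pb.{cc,h} (from the grpc_cpp_plugin command below), which are
# wrapped into the helper libraries ${TARGET_NAME}_proto and
# ${TARGET_NAME}_grpc that the final cc_library depends on.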
+# +# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep) + +function(grpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating grpc ${grpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") + set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") + set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") + cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") + + add_custom_command( + OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) + + # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it + # as compiler warnings instead of error. Should try remove the warnings also. + set_source_files_properties( + ${grpc_grpc_srcs} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") + + set_source_files_properties( + ${grpc_library_SRCS} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") +endfunction() diff --git a/cmake/make_resource.py b/cmake/make_resource.py index a9241b0e3e36c2e79c79e46b4f9114b7f6947341..4f9f5546b9d4176d1035311cb9e1acf0c0eeccfd 100644 --- a/cmake/make_resource.py +++ b/cmake/make_resource.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import os import re import sys diff --git a/cmake/package.cmake b/cmake/package.cmake index ff49a2d08e8f6004320acfce266339aa301eb9c4..79e02147f3f7cc19c1bf45d8a1d208a9a32416ff 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "") set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") set(CPACK_DEBIAN_PACKAGE_SECTION Devel) set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) -set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst") +set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst") #set(CPACK_GENERATOR "DEB") # Start cpack include (CMakePackageConfigHelpers) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 46035a908b588861607a25d3a21cf34b7b6fd4b8..53c2de332ea74b06d1bd6e5bb119cad6af27ed01 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -1,27 +1,28 @@ # This file is use to check all support level of AVX on your machine # so that PaddlePaddle can unleash the vectorization power of muticore. -INCLUDE(CheckCXXSourceRuns) -INCLUDE(CheckCXXSourceCompiles) +include(CheckCXXSourceRuns) +include(CheckCXXSourceCompiles) -IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") +if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(MMX_FLAG "-mmmx") set(SSE2_FLAG "-msse2") set(SSE3_FLAG "-msse3") - SET(AVX_FLAG "-mavx") - SET(AVX2_FLAG "-mavx2") -ELSEIF(MSVC) + set(AVX_FLAG "-mavx") + set(AVX2_FLAG "-mavx2") +elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") set(SSE3_FLAG "/arch:SSE3") SET(AVX_FLAG "/arch:AVX") SET(AVX2_FLAG "/arch:AVX2") -ENDIF() +endif() set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) # Check MMX set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) +set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -32,6 +33,7 @@ int main() # Check SSE2 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG}) +set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -42,6 +44,7 @@ int main() # Check SSE3 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG}) +set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -55,6 +58,7 @@ int main() # Check AVX set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -67,6 +71,7 @@ int main() # Check AVX 2 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() diff --git a/cmake/system.cmake b/cmake/system.cmake index 904652413e026e3a7f3f2a19f48f4e906ce6babb..396bd1a0797edea0522bb1f02349373563b7726a 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -24,15 +24,15 @@ IF(WIN32) SET(HOST_SYSTEM "win32") ELSE(WIN32) IF(APPLE) - EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION) - STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}") - SET(MACOS_VERSION ${VERSION}) SET(HOST_SYSTEM "macosx") - IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET}) + EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION) + STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") + IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) # Set cache variable - end user may change this during 
ccmake or cmake-gui configure. SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.") ENDIF() + set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") ELSE(APPLE) IF(EXISTS "/etc/issue") @@ -48,6 +48,8 @@ ELSE(WIN32) ELSEIF(LINUX_ISSUE MATCHES "Fedora") SET(HOST_SYSTEM "fedora") ENDIF() + + STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}") ENDIF(EXISTS "/etc/issue") IF(EXISTS "/etc/redhat-release") @@ -69,7 +71,7 @@ CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) -MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}") +MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") # configuration for cross-compiling @@ -81,27 +83,12 @@ IF(DEFINED CMAKE_SYSTEM_NAME) ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi") SET(RPI TRUE) INCLUDE(cross_compiling/raspberry_pi) + ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") + SET(IOS TRUE) + INCLUDE(cross_compiling/ios) ENDIF() ENDIF() -# prefix and suffix on different os -IF(WIN32) - SET(LIBRARY_PREFIX "") - SET(SHARED_LIBRARY_SUFFIX ".dll") - SET(STATIC_LIBRARY_SUFFIX ".lib") - SET(EXECUTABLE_SUFFIX ".exe") -ELSE(WIN32) - SET(LIBRARY_PREFIX "lib") - IF(APPLE) - SET(SHARED_LIBRARY_SUFFIX ".dylib") - ELSE(APPLE) - SET(SHARED_LIBRARY_SUFFIX ".so") - ENDIF(APPLE) - - SET(STATIC_LIBRARY_SUFFIX ".a") - SET(EXECUTABLE_SUFFIX "") -ENDIF(WIN32) - # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/cmake/util.cmake b/cmake/util.cmake index 8c9143462227e7081142f6be250b1a45e4b6d51b..0dc33ce385175d1e2dc454d41db467d4b9d9cf9a 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -25,7 +25,9 @@ function(target_circle_link_libraries TARGET_NAME) endif() endforeach() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - list(APPEND LIBS "-undefined dynamic_lookup") + if(NOT IOS_ENABLE_BITCODE) + list(APPEND LIBS "-undefined dynamic_lookup") + endif() endif() list(REVERSE libsInArgn) target_link_libraries(${TARGET_NAME} @@ -71,29 +73,52 @@ function(link_paddle_exe TARGET_NAME) generate_rdma_links() endif() - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_pserver - paddle_trainer_lib - paddle_network - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) + if(MOBILE_INFERENCE) + target_circle_link_libraries(${TARGET_NAME} + ARCHIVE_START + paddle_gserver + paddle_function + ARCHIVE_END + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + else() + target_circle_link_libraries(${TARGET_NAME} + ARCHIVE_START + paddle_gserver + paddle_function + ARCHIVE_END + paddle_pserver + paddle_trainer_lib + paddle_network + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + paddle_optimizer + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + endif() if(ANDROID) target_link_libraries(${TARGET_NAME} log) endif(ANDROID) + 
if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + endif() + add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() @@ -117,7 +142,6 @@ endfunction() macro(add_unittest_without_exec TARGET_NAME) add_executable(${TARGET_NAME} ${ARGN}) link_paddle_test(${TARGET_NAME}) - add_style_check_target(${TARGET_NAME} ${ARGN}) endmacro() # add_unittest @@ -141,17 +165,6 @@ endmacro() function(create_resources res_file output_file) add_custom_command( OUTPUT ${output_file} - COMMAND python ARGS ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file} - DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py) -endfunction() - - -# Create a python unittest using run_python_tests.sh, -# which takes care of making correct running environment -function(add_python_test TEST_NAME) - add_test(NAME ${TEST_NAME} - COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR} - bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh - ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} + DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) endfunction() diff --git a/cmake/version.cmake b/cmake/version.cmake index ac1583a24c828629c46cb9cf4e965f8da2273732..cde650128a068faf32f4abfff5cdfdeb656d8577 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -4,7 +4,7 @@ set(tmp_version "HEAD") while ("${PADDLE_VERSION}" STREQUAL "") execute_process( COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version} - WORKING_DIRECTORY ${PROJ_ROOT} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} OUTPUT_VARIABLE GIT_TAG_NAME RESULT_VARIABLE GIT_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 6fa42fd0c71e78cc2fa6b0fe2cb970baf4ac89ed..94dd3457fb5b513441c4c8e339e1862de9092517 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -27,10 +27,6 @@ sphinx_add_target(paddle_docs ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) -add_dependencies(paddle_docs - gen_proto_py) - - # configured documentation tools and intermediate build results set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") @@ -51,6 +47,3 @@ sphinx_add_target(paddle_docs_cn ${SPHINX_CACHE_DIR_CN} ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) - -add_dependencies(paddle_docs_cn - gen_proto_py) diff --git a/doc/about/index_cn.md b/doc/about/index_cn.md deleted file mode 100644 index 3bf030004d4de8c6f3cb773c6e78c09f40878c5f..0000000000000000000000000000000000000000 --- a/doc/about/index_cn.md +++ /dev/null @@ -1,11 +0,0 @@ -关于PaddlePaddle -================ - -PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台,兼备易用性、高效性、灵活性和可扩展性,目前已被百度内部多个产品线广泛使用。 -PaddlePaddle目前已经开放源码, 但是远未完善,我们希望能在这个基础上不断的改进、扩展和延伸。 -同时我们希望广大开发者积极提供反馈和贡献源代码,建立一个活跃的开源社区。 - -致谢 --------- - -在此,特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。 diff --git a/doc/about/index_en.rst b/doc/about/index_en.rst deleted file mode 100644 index 065c430cdea802ed3c9f487cd00255b85a5598a5..0000000000000000000000000000000000000000 --- a/doc/about/index_en.rst +++ /dev/null @@ -1,14 +0,0 @@ -ABOUT -======= - -PaddlPaddle is an easy-to-use, efficient, flexible and scalable deep learning platform, -which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu. 
- -PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended. -We hope to build an active open source community both by providing feedback and by actively contributing to the source code. - - -Credits --------- - -We owe many thanks to `all contributors and developers `_ of PaddlePaddle! diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst index 9be0b370ee5e301aee4a6e31b1cfa905754968e8..84f9097a6cdc2da269bd6a0685796e14e26da37e 100644 --- a/doc/api/index_cn.rst +++ b/doc/api/index_cn.rst @@ -7,3 +7,4 @@ API 模型配置 数据访问 训练与应用 + v2/fluid.rst diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index 25c1dd00b9cbb3ab647e04cdc2b4c27c552a2332..e6f632e1a5b9c4b50b7c6aa96a120030bd6ce338 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -7,3 +7,4 @@ API v2/model_configs.rst v2/data.rst v2/run_logic.rst + v2/fluid.rst diff --git a/doc/api/v1/data_provider/dataprovider_cn.rst b/doc/api/v1/data_provider/dataprovider_cn.rst deleted file mode 100644 index d08c6b3efacbc35ae274d5b207fe91e747124e79..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/dataprovider_cn.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. _api_dataprovider: - -DataProvider的介绍 -================== - -DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存,让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 :ref:`api_pydataprovider2` ,来自定义传数据的过程。如果有更复杂的使用,或者需要更高的效率,用户也可以在C++端自定义一个 ``DataProvider`` 。 - -PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用哪种DataProvider,并且在DataProvider中实现如何访问训练文件列表(train.list)或测试文件列表(test.list)。 - -- train.list和test.list存放在本地(推荐直接存放到训练目录,以相对路径引用)。一般情况下,两者均为纯文本文件,其中每一行对应一个数据文件地址: - - - 如果数据文件存于本地磁盘,这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。 - - 地址也可以为hdfs文件路径,或者数据库连接路径等。 - - 由于这个地址会被DataProvider使用,因此,如何解析该地址也是用户自定义DataProvider时需要考虑的地方。 -- 如果没有设置test.list,或设置为None,那么在训练过程中不会执行测试操作;否则,会根据命令行参数指定的测试方式,在训练过程中进行测试,从而防止过拟合。 diff --git a/doc/api/v1/data_provider/dataprovider_en.rst b/doc/api/v1/data_provider/dataprovider_en.rst deleted file mode 100644 index 96efbb1da959daec561009fdcc95d353b191dec8..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/dataprovider_en.rst +++ /dev/null @@ -1,34 +0,0 @@ -Introduction -============== -DataProvider is a module that loads training or testing data into cpu or gpu -memory for the following triaining or testing process. - -For simple use, users can use Python :code:`PyDataProvider` to dynamically reads -the original data in any format or in any form, and then transfer them into a -data format PaddlePaddle requires. The process is extremly flexible and highly -customized, with sacrificing the efficiency only a little. This is extremly -useful when you have to dynamically generate certain kinds of data according to, -for example, the training performance. - -Besides, users also can customize a C++ :code:`DataProvider` for a more -complex usage, or for a higher efficiency. - -The following parameters are required to define in the PaddlePaddle network -configuration file (trainer_config.py): which DataProvider is chosen to used, -and specific parameters for DataProvider, including training file list -(train.list) and testing file list (test.list). - -Train.list and test.list are simply two plain text files, which defines path -of training or testing data. It is recommended that directly placing them into -the training directory, and reference to them by using a relative path ( -relative to the PaddePaddle program). 
- -Testing or evaluating will not be performed during training if the test.list is -not set or set to None. Otherwise, PaddlePaddle will evaluate the trained model -by the specified tesing data while training, every testing period (a user -defined command line parameter in PaddlePaddle) to prevent over-fitting. - -Each line of train.list and test.list is an absolute or relative path (relative -to the PaddePaddle program runtime) of data file. Fascinatingly more, each line -can also be a HDFS file path or a SQL connection string. As long as the user -assures how to access each file in DataProvider. diff --git a/doc/api/v1/data_provider/pydataprovider2_cn.rst b/doc/api/v1/data_provider/pydataprovider2_cn.rst deleted file mode 100644 index 8f9db31cfb9946e1d2db3872718bd92787d861f0..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/pydataprovider2_cn.rst +++ /dev/null @@ -1,229 +0,0 @@ -.. _api_pydataprovider2: - -PyDataProvider2的使用 -===================== - -PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据,并提供了简单的Cache功能;同时可以使用户只关注如何从文件中读取每一条数据,而不用关心数据如何传输,如何存储等等。 - -.. contents:: - -MNIST的使用场景 ---------------- - -我们以MNIST手写识别为例,来说明PyDataProvider2的简单使用场景。 - -样例数据 -++++++++ - -MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下: - -.. literalinclude:: src/mnist_train.txt - -其中每行数据代表一张图片,行内使用 ``;`` 分成两部分。第一部分是图片的标签,为0-9中的一个数字;第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字: - -.. literalinclude:: src/train.list - -dataprovider的使用 -++++++++++++++++++ - -.. literalinclude:: src/mnist_provider.dict.py - -- 首先,引入PaddlePaddle的PyDataProvider2包。 -- 其次,定义一个Python的 `Decorator `_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2,同时设置它的input_types属性。 - - - `input_types`_:设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字,显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。 - - .. literalinclude:: src/mnist_config.py - :lines: 9-10 - - - 注意:如果用户不显示指定返回数据的对应关系,那么PaddlePaddle会根据layer的声明顺序,来确定对应关系。但这个关系可能不正确,所以推荐使用显式指定的方式来设置input_types。 -- 最后,实现数据输入函数(如本例的 ``process`` 函数)。 - - - 该函数的功能是:打开文本文件,读取每一行,将行中的数据转换成与input_types一致的格式,然后返回给PaddlePaddle进程。注意, - - - 返回的顺序需要和input_types中定义的顺序一致。 - - 返回时,必须使用Python关键词 ``yield`` ,相关概念是 ``generator`` 。 - - 一次yield调用,返回一条完整的样本。如果想为一个数据文件返回多条样本,只需要在函数中调用多次yield即可(本例中使用for循环进行多次调用)。 - - - 该函数具有两个参数: - - - settings:在本例中没有使用,具体可以参考 `init_hook`_ 中的说明。 - - filename:为 ``train.list`` 或 ``test.list`` 中的一行,即若干数据文件路径的某一个。 - -网络配置中的调用 -++++++++++++++++ - -在网络配置里,只需要一行代码就可以调用这个PyDataProvider2,如, - -.. literalinclude:: src/mnist_config.py - :lines: 1-7 - -训练数据是 ``train.list`` ,没有测试数据,调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。 - -小结 -+++++ - -至此,简单的PyDataProvider2样例就说明完毕了。对用户来说,仅需要知道如何从 **一个文件** 中读取 **一条样本** ,就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作: - -* 将数据组合成Batch进行训练 -* 对训练数据进行Shuffle -* 多线程的数据读取 -* 缓存训练数据到内存(可选) -* CPU->GPU双缓存 - -是不是很简单呢? - -时序模型的使用场景 ------------------- -样例数据 -++++++++ - -时序模型是指数据的某一维度是一个序列形式,即包含时间步信息。所谓时间步信息,不一定和时间有关系,只是说明数据的顺序是重要的。例如,文本信息就是一个序列数据。 - -本例采用英文情感分类的数据,即将一段英文文本数据,分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下: - -.. literalinclude:: src/sentimental_train.txt - -dataprovider的使用 -++++++++++++++++++ - -相对MNIST而言,这个dataprovider较复杂,主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的,它会在dataprovider创建的时候执行。 - -- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列,因此使用 ``integer_value_sequence`` 类型来设置。 -- 将 ``dictionary`` 存入settings对象,在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象,即一个将单词字符串映射到单词ID的字典。 - -.. 
literalinclude:: src/sentimental_provider.py - -网络配置中的调用 -++++++++++++++++ - -调用这个PyDataProvider2的方法,基本上和MNIST样例一致,除了 - -* 在配置中需要读取外部字典。 -* 在声明DataProvider的时候传入dictionary作为参数。 - -.. literalinclude:: src/sentimental_config.py - :emphasize-lines: 12-14 - -参考(Reference) ---------------- - -@provider -+++++++++ - -``@provider`` 是一个Python的 `Decorator`_ ,可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系,只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下: - -* input_types:数据输入格式。具体的格式说明,请参考 `input_types`_ 。 -* should_shuffle:是不是要对数据做Shuffle。训练时默认shuffle,测试时默认不shuffle。 -* min_pool_size:设置内存中最小暂存的数据条数,也是PaddlePaddle所能够保证的shuffle粒度。如果为-1,则会预先读取全部数据到内存中。 -* pool_size: 设置内存中暂存的数据条数。如果为-1(默认),则不在乎内存暂存多少条数据。如果设置,则推荐大于训练时batch size的值,并且在内存足够的情况下越大越好。 -* can_over_batch_size:是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题,一般推荐设置成True。 -* calc_batch_size:可以传入一个函数,用于自定义每条数据的batch size(默认为1)。 -* cache: 数据缓存的策略,具体请参考 `cache`_ 。 -* init_hook:初始化时调用的函数,具体请参考 `init_hook`_ 。 -* check:如果为true,会根据input_types检查数据的合法性。 -* check_fail_continue:如果为true,那么当check出数据不合法时,会扔到这条数据,继续训练或预测。(对check=false的情况,没有作用) - -input_types -+++++++++++ - -PaddlePaddle的数据包括四种主要类型,和三种序列模式。 - -四种数据类型: - -* dense_vector:稠密的浮点数向量。 -* sparse_binary_vector:稀疏的01向量,即大部分值为0,但有值的地方必须为1。 -* sparse_float_vector:稀疏的向量,即大部分值为0,但有值的部分可以是任何浮点数。 -* integer:整数标签。 - -三种序列模式: - -* SequenceType.NO_SEQUENCE:不是一条序列 -* SequenceType.SEQUENCE:是一条时间序列 -* SequenceType.SUB_SEQUENCE: 是一条时间序列,且序列的每一个元素还是一个时间序列。 - -不同的数据类型和序列模式返回的格式不同,列表如下: - -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE | -+======================+=====================+===================================+================================================+ -| dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] | [[[f, ...], ...], [[f, ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] 
| -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ - -其中,f代表一个浮点数,i代表一个整数。 - -注意:对sparse_binary_vector和sparse_float_vector,PaddlePaddle存的是有值位置的索引。例如, - -- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ,类型是sparse_binary_vector,返回的是 ``[1, 2]`` 。 -- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ,类型是sparse_float_vector,返回的是 ``[(1, 0.5), (2, 0.7)]`` 。 - -init_hook -+++++++++ - -init_hook可以传入一个函数。该函数在初始化的时候会被调用,其参数如下: - -* 第一个参数是settings对象,它和数据传入函数的第一个参数(如本例中 ``process`` 函数的 ``settings`` 参数)必须一致。该对象具有以下两个属性: - * settings.input_types:数据输入格式,具体请参考 `input_types`_ 。 - * settings.logger:一个logging对象。 -* 其他参数使用 ``kwargs`` (key word arguments)传入,包括以下两种: - * PaddlePaddle定义的参数: 1)is_train:bool型参数,表示用于训练或预测;2)file_list:所有文件列表。 - * 用户定义的参数:使用args在网络配置中设置。 - -注意:PaddlePaddle保留添加参数的权力,因此init_hook尽量使用 ``**kwargs`` 来接受不使用的函数以保证兼容性。 - -cache -+++++ - -PyDataProvider2提供了两种简单的Cache策略: - -* CacheType.NO_CACHE:不缓存任何数据,每次都会从python端读取数据 -* CacheType.CACHE_PASS_IN_MEM:第一个pass会从python端读取数据,剩下的pass会直接从内存里 - 读取数据。 - - -注意事项 --------- - -可能的内存泄露问题 -++++++++++++++++++ - -PaddlePaddle将train.list中的每一行都传递给process函数,从而生成多个generator。当训练数据非常多时,就会生成非常多的generator。 - -虽然每个generator在没有调用的时候,是几乎不占内存的;但当调用过一次后,generator便会存下当前的上下文(Context),而这个Context可能会非常大。并且,generator至少需要调用两次才会知道是否停止。所以,即使process函数里面只有一个yield,也需要两次随机选择到相同generator的时候,才会释放该段内存。 - -.. code-block:: python - - def func(): - yield 0 - - f = func() # 创建generator - tmp = next(f) # 调用一次,返回0 - tmp = next(f) # 调用第二次的时候,才会Stop Iteration - -由于顺序调用这些generator不会出现上述问题,因此有两种解决方案: - -1. **最佳推荐**:将样本的地址放入另一个文本文件,train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。 -2. 在generator的上下文中尽量留下非常少的变量引用,例如 - -.. code-block:: python - - def real_process(fn): - # ... read from fn - return result # 当函数返回的时候,python可以解除掉内部变量的引用。 - - def process(fn): - yield real_process(fn) - -注意:这个问题是PyDataProvider读数据时候的逻辑问题,很难整体修正。 - -内存不够用的情况 -++++++++++++++++ - -PyDataProvider2会尽可能多的使用内存。因此,对于内存较小的机器,推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。 - diff --git a/doc/api/v1/data_provider/pydataprovider2_en.rst b/doc/api/v1/data_provider/pydataprovider2_en.rst deleted file mode 100644 index e8fb6292779790765154502bff319ea10ab1e70b..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/pydataprovider2_en.rst +++ /dev/null @@ -1,249 +0,0 @@ -.. _api_pydataprovider2: - -PyDataProvider2 -=============== - -We highly recommand users to use PyDataProvider2 to provide training or testing -data to PaddlePaddle. The user only needs to focus on how to read a single -sample from the original data file by using PyDataProvider2, leaving all of the -trivial work, including, transfering data into cpu/gpu memory, shuffle, binary -serialization to PyDataProvider2. PyDataProvider2 uses multithreading and a -fanscinating but simple cache strategy to optimize the efficiency of the data -providing process. - -DataProvider for the non-sequential model ------------------------------------------ - -Here we use the MNIST handwriting recognition data as an example to illustrate -how to write a simple PyDataProvider. - -MNIST is a handwriting classification data set. It contains 70,000 digital -grayscale images. Labels of the training sample range from 0 to 9. All the -images have been size-normalized and centered into images with the same size -of 28 x 28 pixels. - -A small part of the original data as an example is shown as below: - -.. literalinclude:: src/mnist_train.txt - -Each line of the data contains two parts, separated by :code:`;`. 
The first part is -label of an image. The second part contains 28x28 pixel float values. - -Just write path of the above data into train.list. It looks like this: - -.. literalinclude:: src/train.list - -The corresponding dataprovider is shown as below: - -.. literalinclude:: src/mnist_provider.dict.py - -The first line imports PyDataProvider2 package. -The main function is the process function, that has two parameters. -The first parameter is the settings, which is not used in this example. -The second parameter is the filename, that is exactly each line of train.list. -This parameter is passed to the process function by PaddlePaddle. - -:code:`@provider` is a Python -`Decorator `_ . -It sets some properties to DataProvider, and constructs a real PaddlePaddle -DataProvider from a very simple user implemented python function. It does not -matter if you are not familiar with `Decorator`_. You can keep it simple by -just taking :code:`@provider` as a fixed mark above the provider function you -implemented. - -`input_types`_ defines the data format that a DataProvider returns. -In this example, it is set to a 28x28-dimensional dense vector and an integer -scalar, whose value ranges from 0 to 9. -`input_types`_ can be set to several kinds of input formats, please refer to the -document of `input_types`_ for more details. - - -The process method is the core part to construct a real DataProvider in -PaddlePaddle. It implements how to open the text file, how to read one sample -from the original text file, convert them into `input_types`_, and give them -back to PaddlePaddle process at line 23. -Note that data yielded by the process function must follow the same order that -`input_types`_ are defined. - - -With the help of PyDataProvider2, user can focus on how to generate ONE traning -sample by using keywords :code:`yield`. -:code:`yield` is a python keyword, and a concept related to it includes -:code:`generator`. - -Only a few lines of codes need to be added into the training configuration file, -you can take this as an example. - -.. literalinclude:: src/mnist_config.py - -Here we specify training data by :code:`train.list`, and no testing data is specified. -The method which actually provide data is :code:`process`. - -User also can use another style to provide data, which defines the -:code:`data_layer`'s name explicitly when `yield`. For example, -the :code:`dataprovider` is shown as below. - -.. literalinclude:: src/mnist_provider.dict.py - :linenos: - -If user did't give the :code:`data_layer`'s name, PaddlePaddle will use -the order of :code:`data_layer` definition roughly to determine which feature to -which :code:`data_layer`. This order may be not correct, so TO DEFINE THE -:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMANDED WAY TO PROVIDER DATA. - -Now, this simple example of using PyDataProvider is finished. -The only thing that the user should know is how to generte **one sample** from -**one data file**. -And PaddlePadle will do all of the rest things\: - -* Form a training batch -* Shuffle the training data -* Read data with multithreading -* Cache the training data (Optional) -* CPU-> GPU double buffering. - -Is this cool? - -.. _api_pydataprovider2_sequential_model: - -DataProvider for the sequential model -------------------------------------- -A sequence model takes sequences as its input. A sequence is made up of several -timesteps. The so-called timestep, is not necessary to have something to do -with time. 
It can also be explained to that the order of data are taken into -consideration into model design and training. -For example, the sentence can be interpreted as a kind of sequence data in NLP -tasks. - -Here is an example on data proivider for English sentiment classification data. -The original input data are simple English text, labeled into positive or -negative sentiment (marked by 0 and 1 respectively). - -A small part of the original data as an example can be found in the path below: - -.. literalinclude:: src/sentimental_train.txt - -The corresponding data provider can be found in the path below: - -.. literalinclude:: src/sentimental_provider.py - -This data provider for sequential model is a little more complex than that -for MINST dataset. -A new initialization method is introduced here. -The method :code:`on_init` is configured to DataProvider by :code:`@provider`'s -:code:`init_hook` parameter, and it will be invoked once DataProvider is -initialized. The :code:`on_init` function has the following parameters: - -* The first parameter is the settings object. -* The rest parameters are passed by key word arguments. Some of them are passed - by PaddlePaddle, see reference for `init_hook`_. - The :code:`dictionary` object is a python dict object passed from the trainer - configuration file, and it maps word string to word id. - -To pass these parameters into DataProvider, the following lines should be added -into trainer configuration file. - -.. literalinclude:: src/sentimental_config.py - -The definition is basically same as MNIST example, except: -* Load dictionary in this configuration -* Pass it as a parameter to the DataProvider - -The `input_types` is configured in method :code:`on_init`. It has the same -effect to configure them by :code:`@provider`'s :code:`input_types` parameter. -However, the :code:`input_types` is set at runtime, so we can set it to -different types according to the input data. Input of the neural network is a -sequence of word id, so set :code:`seq_type` to :code:`integer_value_sequence`. - -Durning :code:`on_init`, we save :code:`dictionary` variable to -:code:`settings`, and it will be used in :code:`process`. Note the settings -parameter for the process function and for the on_init's function are a same -object. - -The basic processing logic is the same as MNIST's :code:`process` method. Each -sample in the data file is given back to PaddlePaddle process. - -Thus, the basic usage of PyDataProvider is here. -Please refer to the following section reference for details. - -Reference ---------- - -@provider -+++++++++ - -.. autofunction:: paddle.trainer.PyDataProvider2.provider - -input_types -+++++++++++ - -PaddlePaddle has four data types, and three sequence types. -The four data types are: - -* :code:`dense_vector`: dense float vector. -* :code:`sparse_binary_vector`: sparse binary vector, most of the value is 0, and - the non zero elements are fixed to 1. -* :code:`sparse_float_vector`: sparse float vector, most of the value is 0, and some - non zero elements can be any float value. They are given by the user. -* :code:`integer`: an integer scalar, that is especially used for label or word index. - -The three sequence types are: - -* :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence. -* :code:`SequenceType.SEQUENCE` means the sample is a sequence. -* :code:`SequenceType.SUB_SEQUENCE` means it is a nested sequence, that each timestep of - the input sequence is also a sequence. - -Different input type has a defferenct input format. 
Their formats are shown -in the above table. - -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE | -+======================+=====================+===================================+================================================+ -| dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] | [[[f, ...], ...], [[f, ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ -| integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] | -+----------------------+---------------------+-----------------------------------+------------------------------------------------+ - -where f represents a float value, i represents an integer value. - -init_hook -+++++++++ - -init_hook is a function that is invoked once the data provoder is initialized. -Its parameters lists as follows: - -* The first parameter is a settings object, which is the same to :code:`settings` - in :code:`process` method. The object contains several attributes, including: - - * :code:`settings.input_types`: the input types. Reference `input_types`_. - * :code:`settings.logger`: a logging object. - -* The rest parameters are the key word arguments. It is made up of PaddpePaddle - pre-defined parameters and user defined parameters. - - * PaddlePaddle-defined parameters including: - - * :code:`is_train` is a bool parameter that indicates the DataProvider is used in - training or testing. - * :code:`file_list` is the list of all files. - - * User-defined parameters args can be set in training configuration. - -Note, PaddlePaddle reserves the right to add pre-defined parameter, so please -use :code:`**kwargs` in init_hook to ensure compatibility by accepting the -parameters which your init_hook does not use. - -cache -+++++ -DataProvider provides two simple cache strategy. They are: - -* :code:`CacheType.NO_CACHE` means do not cache any data, then data is read at runtime by - the user implemented python module every pass. -* :code:`CacheType.CACHE_PASS_IN_MEM` means the first pass reads data by the user - implemented python module, and the rest passes will directly read data from - memory. 
diff --git a/doc/api/v1/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py deleted file mode 100644 index 429338c57f8f865f0c5835d933445b65ee2ea7aa..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/src/mnist_config.py +++ /dev/null @@ -1,10 +0,0 @@ -from paddle.trainer_config_helpers import * - -define_py_data_sources2( - train_list='train.list', - test_list=None, - module='mnist_provider', - obj='process') - -img = data_layer(name='pixel', size=784) -label = data_layer(name='label', size=10) diff --git a/doc/api/v1/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py deleted file mode 100644 index 2ba0b126a0d6239f84950e130410aaaa6e1f24cd..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/src/mnist_provider.dict.py +++ /dev/null @@ -1,24 +0,0 @@ -from paddle.trainer.PyDataProvider2 import * - - -# Define a py data provider -@provider( - input_types={'pixel': dense_vector(28 * 28), - 'label': integer_value(10)}) -def process(settings, filename): # settings is not used currently. - f = open(filename, 'r') # open one of training file - - for line in f: # read each line - label, pixel = line.split(';') - - # get features and label - pixels_str = pixel.split(' ') - - pixels_float = [] - for each_pixel_str in pixels_str: - pixels_float.append(float(each_pixel_str)) - - # give data to paddle. - yield {"pixel": pixels_float, 'label': int(label)} - - f.close() # close file diff --git a/doc/api/v1/data_provider/src/mnist_train.txt b/doc/api/v1/data_provider/src/mnist_train.txt deleted file mode 100644 index 34be718ad9ea49e7d9aabc34ad70099479df3b31..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/src/mnist_train.txt +++ /dev/null @@ -1,3 +0,0 @@ -5;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.215686 0.533333 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.67451 0.992157 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.070588 0.886275 0.992157 0 0 0 0 0 0 0 0 0 0 0.192157 0.070588 0 0 0 0 0 0 0 0 0 0 0 0 0 0.670588 0.992157 0.992157 0 0 0 0 0 0 0 0 0 0.117647 0.933333 0.858824 0.313725 0 0 0 0 0 0 0 0 0 0 0 0.090196 0.858824 0.992157 0.831373 0 0 0 0 0 0 0 0 0 0.141176 0.992157 0.992157 0.611765 0.054902 0 0 0 0 0 0 0 0 0 0 0.258824 0.992157 0.992157 0.529412 0 0 0 0 0 0 0 0 0 0.368627 0.992157 0.992157 0.419608 0.003922 0 0 0 0 0 0 0 0 0 0.094118 0.835294 0.992157 0.992157 0.517647 0 0 0 0 0 0 0 0 0 0.603922 0.992157 0.992157 0.992157 0.603922 0.545098 0.043137 0 0 0 0 0 0 0 0.447059 0.992157 0.992157 0.956863 0.062745 0 0 0 0 0 0 0 0 0.011765 0.666667 0.992157 0.992157 0.992157 0.992157 0.992157 0.745098 0.137255 0 0 0 0 0 0.152941 0.866667 0.992157 0.992157 0.521569 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.992157 0.803922 0.352941 0.745098 0.992157 0.945098 0.317647 0 0 0 0 0.580392 0.992157 0.992157 0.764706 0.043137 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.776471 0.043137 0 0.007843 0.27451 0.882353 0.941176 0.176471 0 0 0.180392 0.898039 0.992157 0.992157 0.313725 0 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.713725 0 0 0 0 0.627451 0.992157 0.729412 0.062745 0 0.509804 0.992157 0.992157 0.776471 0.035294 0 0 0 0 0 0 0 0 0 0 0.494118 0.992157 0.992157 0.968627 0.168627 0 0 0 0.423529 0.992157 0.992157 0.364706 0 
0.717647 0.992157 0.992157 0.317647 0 0 0 0 0 0 0 0 0 0 0 0.533333 0.992157 0.984314 0.945098 0.603922 0 0 0 0.003922 0.466667 0.992157 0.988235 0.976471 0.992157 0.992157 0.788235 0.007843 0 0 0 0 0 0 0 0 0 0 0 0.686275 0.882353 0.364706 0 0 0 0 0 0 0.098039 0.588235 0.992157 0.992157 0.992157 0.980392 0.305882 0 0 0 0 0 0 0 0 0 0 0 0 0.101961 0.67451 0.321569 0 0 0 0 0 0 0 0.105882 0.733333 0.976471 0.811765 0.713725 0 0 0 0 0 0 0 0 0 0 0 0 0 0.65098 0.992157 0.321569 0 0 0 0 0 0 0 0 0 0.25098 0.007843 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0.94902 0.219608 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.968627 0.764706 0.152941 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.498039 0.25098 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0; -0;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.298039 0.333333 0.333333 0.333333 0.337255 0.333333 0.333333 0.109804 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.027451 0.223529 0.776471 0.964706 0.988235 0.988235 0.988235 0.992157 0.988235 0.988235 0.780392 0.098039 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.14902 0.698039 0.988235 0.992157 0.988235 0.901961 0.87451 0.568627 0.882353 0.976471 0.988235 0.988235 0.501961 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.188235 0.647059 0.988235 0.988235 0.745098 0.439216 0.098039 0 0 0 0.572549 0.988235 0.988235 0.988235 0 0 0 0 0 0 0 0 0 0 0 0 0 0.2 0.933333 0.992157 0.941176 0.247059 0 0 0 0 0 0 0.188235 0.898039 0.992157 0.992157 0 0 0 0 0 0 0 0 0 0 0 0.039216 0.639216 0.933333 0.988235 0.913725 0.278431 0 0 0 0 0 0 0 0.113725 0.843137 0.988235 0.988235 0 0 0 0 0 0 0 0 0 0 0 0.235294 0.988235 0.992157 0.988235 0.815686 0.07451 0 0 0 0 0 0 0 0.333333 0.988235 0.988235 0.552941 0 0 0 0 0 0 0 0 0 0 0.211765 0.878431 0.988235 0.992157 0.701961 0.329412 0.109804 0 0 0 0 0 0 0 0.698039 0.988235 0.913725 0.145098 0 0 0 0 0 0 0 0 0 0.188235 0.890196 0.988235 0.988235 0.745098 0.047059 0 0 0 0 0 0 0 0 0 0.882353 0.988235 0.568627 0 0 0 0 0 0 0 0 0 0.2 0.933333 0.992157 0.992157 0.992157 0.447059 0.294118 0 0 0 0 0 0 0 0 0.447059 0.992157 0.768627 0 0 0 0 0 0 0 0 0 0 0.623529 0.988235 0.988235 0.988235 0.988235 0.992157 0.47451 0 0 0 0 0 0 0 0.188235 0.933333 0.87451 0.509804 0 0 0 0 0 0 0 0 0 0 0.992157 0.988235 0.937255 0.792157 0.988235 0.894118 0.082353 0 0 0 0 0 0 0.027451 0.647059 0.992157 0.654902 0 0 0 0 0 0 0 0 0 0 0 0.623529 0.988235 0.913725 0.329412 0.376471 0.184314 0 0 0 0 0 0 0.027451 0.513725 0.988235 0.635294 0.219608 0 0 0 0 0 0 0 0 0 0 0 0.196078 0.929412 0.988235 0.988235 0.741176 0.309804 0 0 0 0 0 0 0.529412 0.988235 0.678431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.223529 0.992157 0.992157 1 0.992157 0.992157 0.992157 0.992157 1 0.992157 0.992157 0.882353 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.023529 0.478431 0.654902 0.658824 0.952941 0.988235 0.988235 0.988235 0.992157 0.988235 0.729412 0.278431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.196078 0.647059 0.764706 0.764706 0.768627 0.580392 0.047059 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0; -4;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.180392 0.470588 0.623529 0.623529 0.623529 0.588235 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.243137 0.494118 0.862745 0.870588 0.960784 0.996078 0.996078 0.996078 0.996078 0.992157 0.466667 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.317647 0.639216 0.639216 0.639216 0.639216 0.639216 0.470588 0.262745 0.333333 0.929412 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.184314 0.992157 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.192157 0.996078 0.384314 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.454902 0.980392 0.219608 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.564706 0.941176 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.588235 0.776471 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.945098 0.560784 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.054902 0.952941 0.356863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.337255 0.917647 0.109804 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.698039 0.701961 0.019608 0.4 0.662745 0.662745 0.662745 0.662745 0.662745 0.662745 0.662745 0.376471 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.090196 0.639216 0.972549 0.945098 0.913725 0.996078 0.996078 0.996078 0.996078 1 0.996078 0.996078 1 0.996078 0 0 0 0 0 0 0 0 0 0 0.007843 0.105882 0.717647 0.776471 0.905882 0.996078 0.996078 0.988235 0.980392 0.862745 0.537255 0.223529 0.223529 0.368627 0.376471 0.6 0.6 0.6 0 0 0 0 0 0 0 0 0.262745 0.470588 0.6 0.996078 0.996078 0.996078 0.996078 0.847059 0.356863 0.156863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.909804 0.705882 0.823529 0.635294 0.490196 0.219608 0.113725 0.062745 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.152941 0.152941 0.156863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0; diff --git a/doc/api/v1/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py deleted file mode 100644 index 7ce71608a2372b2484ae40ccf01f0621728ddef2..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/src/sentimental_config.py +++ /dev/null @@ -1,14 +0,0 @@ -from paddle.trainer_config_helpers import * - -dictionary = dict() -... # read dictionary from outside - -define_py_data_sources2( - train_list='train.list', - test_list=None, - module='sentimental_provider', - obj='process', - # above codes same as mnist sample. - args={ # pass to provider. 
- 'dictionary': dictionary - }) diff --git a/doc/api/v1/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py deleted file mode 100644 index 14bd0e05a921dbfd5212d8483524d3af3e4ae98f..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/src/sentimental_provider.py +++ /dev/null @@ -1,43 +0,0 @@ -from paddle.trainer.PyDataProvider2 import * - - -def on_init(settings, dictionary, **kwargs): - # on_init will invoke when data provider is initialized. The dictionary - # is passed from trainer_config, and is a dict object with type - # (word string => word id). - - # set input types in runtime. It will do the same thing as - # @provider(input_types) will do, but it is set dynamically during runtime. - settings.input_types = { - # The text is a sequence of integer values, and each value is a word id. - # The whole sequence is the sentences that we want to predict its - # sentimental. - 'data': integer_value_sequence(len(dictionary)), # text input - 'label': integer_value(2) # label positive/negative - } - - # save dictionary as settings.dictionary. - # It will be used in process method. - settings.dictionary = dictionary - - -@provider(init_hook=on_init) -def process(settings, filename): - f = open(filename, 'r') - - for line in f: # read each line of file - label, sentence = line.split('\t') # get label and sentence - words = sentence.split(' ') # get words - - # convert word string to word id - # the word not in dictionary will be ignored. - word_ids = [] - - for each_word in words: - if each_word in settings.dictionary: - word_ids.append(settings.dictionary[each_word]) - - # give data to paddle. - yield word_ids, int(label) - - f.close() diff --git a/doc/api/v1/data_provider/src/sentimental_train.txt b/doc/api/v1/data_provider/src/sentimental_train.txt deleted file mode 100644 index 0060ac267c4bf884a8e19553bc4bdea48c89c9b2..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/src/sentimental_train.txt +++ /dev/null @@ -1,3 +0,0 @@ -0 I saw this movie at the AFI Dallas festival . It all takes place at a lake house and it looks wonderful . -1 This documentary makes you travel all around the globe . It contains rare and stunning sequels from the wilderness . -... diff --git a/doc/api/v1/data_provider/src/train.list b/doc/api/v1/data_provider/src/train.list deleted file mode 100644 index 92bdc0a8b4c21b3091341d93afc5c536c3cffe47..0000000000000000000000000000000000000000 --- a/doc/api/v1/data_provider/src/train.list +++ /dev/null @@ -1 +0,0 @@ -mnist_train.txt diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst deleted file mode 100644 index 3718cd73a2003b8ef6c406a9bd51dc68e76402dc..0000000000000000000000000000000000000000 --- a/doc/api/v1/index_cn.rst +++ /dev/null @@ -1,37 +0,0 @@ -API中文手册 -============ - -DataProvider API ----------------- - -.. toctree:: - :maxdepth: 1 - - data_provider/dataprovider_cn.rst - data_provider/pydataprovider2_cn.rst - -.. _api_trainer_config: - -Model Config API ----------------- - -.. toctree:: - :maxdepth: 1 - - trainer_config_helpers/optimizers.rst - trainer_config_helpers/data_sources.rst - trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst - trainer_config_helpers/poolings.rst - trainer_config_helpers/networks.rst - trainer_config_helpers/evaluators.rst - trainer_config_helpers/attrs.rst - - -Applications API ----------------- - -.. 
toctree:: - :maxdepth: 1 - - predict/swig_py_paddle_cn.rst diff --git a/doc/api/v1/index_en.rst b/doc/api/v1/index_en.rst deleted file mode 100644 index 10c297a71d6988c002de868e804ed9ee2345fbd7..0000000000000000000000000000000000000000 --- a/doc/api/v1/index_en.rst +++ /dev/null @@ -1,37 +0,0 @@ -API -=== - -DataProvider API ----------------- - -.. toctree:: - :maxdepth: 1 - - data_provider/dataprovider_en.rst - data_provider/pydataprovider2_en.rst - -.. _api_trainer_config: - -Model Config API ----------------- - -.. toctree:: - :maxdepth: 1 - - trainer_config_helpers/optimizers.rst - trainer_config_helpers/data_sources.rst - trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst - trainer_config_helpers/poolings.rst - trainer_config_helpers/networks.rst - trainer_config_helpers/evaluators.rst - trainer_config_helpers/attrs.rst - - -Applications API ----------------- - -.. toctree:: - :maxdepth: 1 - - predict/swig_py_paddle_en.rst diff --git a/doc/api/v1/predict/src/predict_sample.py b/doc/api/v1/predict/src/predict_sample.py deleted file mode 100644 index 51349250e80ce11e476d952991f0d046f65286b4..0000000000000000000000000000000000000000 --- a/doc/api/v1/predict/src/predict_sample.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import dense_vector -from paddle.trainer.config_parser import parse_config - -TEST_DATA = [[[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, 0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, 0.992157, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.070588, 0.886275, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157, - 0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.670588, 0.992157, - 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.141176, 0.992157, 0.992157, 0.611765, 0.054902, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, 0.529412, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.603922, 0.992157, 0.992157, 0.992157, 0.603922, - 0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157, - 0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157, - 0.992157, 0.992157, 0.992157, 0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0, - 0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098, - 0.992157, 0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157, - 0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157, - 0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, 0, - 0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0.070588, 0.992157, 0.992157, 0.713725, 0, 0, 0, 0, 0.627451, - 0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, 0.776471, - 0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157, - 0.968627, 0.168627, 0, 0, 0, 0.423529, 0.992157, 0.992157, 0.364706, 0, - 0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922, - 0.466667, 0.992157, 0.988235, 0.976471, 0.992157, 0.992157, 0.788235, - 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, 0.882353, 0.364706, 0, - 0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392, - 0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569, - 0, 0, 0, 0, 0, 0, 0, 0.105882, 0.733333, 0.976471, 0.811765, 0.713725, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, 0.321569, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.968627, 0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, 0.25098, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 -]], [[ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255, - 0.333333, 0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0.027451, 0.223529, 0.776471, 0.964706, 0.988235, 0.988235, 0.988235, - 0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961, - 0.87451, 0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.647059, 0.988235, 0.988235, - 0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, 0.988235, - 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157, - 0.941176, 0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157, - 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.039216, 0.639216, 0.933333, - 0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137, - 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235, - 0.992157, 0.988235, 0.815686, 0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333, - 0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211765, - 0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0, - 0, 0, 0.698039, 0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.188235, 0.890196, 0.988235, 0.988235, 0.745098, 0.047059, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, - 0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0, - 0, 0, 0, 0.447059, 0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.623529, 0.988235, 0.988235, 0.988235, 0.988235, 0.992157, 0.47451, 0, 0, - 0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118, - 0.082353, 0, 0, 0, 0, 0, 0, 0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, 0.329412, 0.376471, - 0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294, - 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235, - 0.988235, 0.741176, 0.309804, 0, 0, 0, 0, 0, 0, 0.529412, 0.988235, - 0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157, - 0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157, - 0.882353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529, - 0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, 0.988235, - 0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627, - 0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0 -]]] - - -def main(): - conf = parse_config("./mnist_model/trainer_config.py", "") - print conf.data_config.load_data_args - network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(network, swig_paddle.GradientMachine) # For code hint. - network.loadParameters("./mnist_model/") - converter = DataProviderConverter([dense_vector(784)]) - inArg = converter(TEST_DATA) - print network.forwardTest(inArg) - - -if __name__ == '__main__': - swig_paddle.initPaddle("--use_gpu=0") - main() diff --git a/doc/api/v1/predict/swig_py_paddle_cn.rst b/doc/api/v1/predict/swig_py_paddle_cn.rst deleted file mode 100644 index 42f333dba2e996e70572b3cda085b83e402ede8e..0000000000000000000000000000000000000000 --- a/doc/api/v1/predict/swig_py_paddle_cn.rst +++ /dev/null @@ -1,58 +0,0 @@ -.. _api_swig_py_paddle: - -基于Python的预测 -================ - -预测流程 --------- - -PaddlePaddle使用swig对常用的预测接口进行了封装,通过编译会生成py_paddle软件包,安装该软件包就可以在python环境下实现模型预测。可以使用python的 ``help()`` 函数查询软件包相关API说明。 - -基于Python的模型预测,主要包括以下五个步骤。 - -1. 初始化PaddlePaddle环境 - - 在程序开始阶段,通过调用 ``swig_paddle.initPaddle()`` 并传入相应的命令行参数初始化PaddlePaddle。 - -2. 解析模型配置文件 - - 初始化之后,可以通过调用 ``parse_config()`` 解析训练模型时用的配置文件。注意预测数据通常不包含label, 同时预测网络通常直接输出最后一层的结果而不是像训练网络一样再接一层cost layer,所以一般需要对训练用的模型配置文件稍作相应修改才能在预测时使用。 - -3. 构造paddle.GradientMachine - - 通过调用 ``swig_paddle.GradientMachine.createFromConfigproto()`` 传入上一步解析出来的模型配置就可以创建一个 ``GradientMachine``。 - -4. 准备预测数据 - - swig_paddle中的预测接口的参数是自定义的C++数据类型,py_paddle里面提供了一个工具类 ``DataProviderConverter`` 可以用于接收和PyDataProvider2一样的输入数据并转换成预测接口所需的数据类型。 - -5. 模型预测 - - 通过调用 ``forwardTest()`` 传入预测数据,直接返回计算结果。 - - -预测Demo --------- - -如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root\demo\mnist`` 目录下的demo训练出来。 - -.. literalinclude:: src/predict_sample.py - :language: python - :lines: 15-18,121-136 - - -Demo预测输出如下,其中value即为softmax层的输出。由于TEST_DATA包含两条预测数据,所以输出的value包含两个向量 。 - -.. code-block:: text - - [{'id': None, 'value': array( - [[ 5.53018653e-09, 1.12194102e-05, 1.96644767e-09, - 1.43630644e-02, 1.51111044e-13, 9.85625684e-01, - 2.08823112e-10, 2.32777140e-08, 2.00186201e-09, - 1.15501715e-08], - [ 9.99982715e-01, 1.27787406e-10, 1.72296313e-05, - 1.49316648e-09, 1.36540484e-11, 6.93137714e-10, - 2.70634608e-08, 3.48565123e-08, 5.25639710e-09, - 4.48684503e-08]], dtype=float32)}] - - diff --git a/doc/api/v1/predict/swig_py_paddle_en.rst b/doc/api/v1/predict/swig_py_paddle_en.rst deleted file mode 100644 index 1c628e6971fa5643e6a9ca629488049957686193..0000000000000000000000000000000000000000 --- a/doc/api/v1/predict/swig_py_paddle_en.rst +++ /dev/null @@ -1,59 +0,0 @@ -Python Prediction -================== - -PaddlePaddle offers a set of clean prediction interfaces for python with the help of -SWIG. The main steps of predict values in python are: - -* Parse training configurations -* Construct GradientMachine -* Prepare data -* Predict - -Here is a sample python script that shows the typical prediction process for the -MNIST classification problem. A complete sample code could be found at -:code:`src_root/doc/ui/predict/predict_sample.py`. - -.. 
literalinclude:: src/predict_sample.py - :language: python - :lines: 15-18,90-100,101-104 - -The module that does the most of the job is py_paddle.swig_paddle, it's -generated by SWIG and has complete documents, for more details you can use -python's :code:`help()` function. Let's walk through the above python script: - -* At the beginning, use :code:`swig_paddle.initPaddle()` to initialize - PaddlePaddle with command line arguments, for more about command line arguments - see :ref:`cmd_detail_introduction` . -* Parse the configuration file that is used in training with :code:`parse_config()`. - Because data to predict with always have no label, and output of prediction work - normally is the output layer rather than the cost layer, so you should modify - the configuration file accordingly before using it in the prediction work. -* Create a neural network with - :code:`swig_paddle.GradientMachine.createFromConfigproto()`, which takes the - parsed configuration :code:`conf.model_config` as argument. Then load the - trained parameters from the model with :code:`network.loadParameters()`. -* Create a data converter object of utility class :code:`DataProviderConverter`. - - Note: As swig_paddle can only accept C++ matrices, we offer a utility - class DataProviderConverter that can accept the same input data with - PyDataProvider2, for more information please refer to document - of :ref:`api_pydataprovider2` . -* Do the prediction with :code:`forwardTest()`, which takes the converted - input data and outputs the activations of the output layer. - -Here is a typical output: - -.. code-block:: text - - [{'id': None, 'value': array([[ 5.53018653e-09, 1.12194102e-05, 1.96644767e-09, - 1.43630644e-02, 1.51111044e-13, 9.85625684e-01, - 2.08823112e-10, 2.32777140e-08, 2.00186201e-09, - 1.15501715e-08], - [ 9.99982715e-01, 1.27787406e-10, 1.72296313e-05, - 1.49316648e-09, 1.36540484e-11, 6.93137714e-10, - 2.70634608e-08, 3.48565123e-08, 5.25639710e-09, - 4.48684503e-08]], dtype=float32)}] - -:code:`value` is the output of the output layer, each row represents result of -the corresponding row in the input data, each element represents activation of -the corresponding neuron in the output layer. - diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst index eca3ce03bcdc599edca802d8dfca48d4f28275a2..5317e66b64bbd85c61f19700a9d2c1d239dee573 100644 --- a/doc/api/v2/config/activation.rst +++ b/doc/api/v2/config/activation.rst @@ -99,3 +99,10 @@ STanh .. automodule:: paddle.v2.activation :members: STanh :noindex: + +SoftSign +======== + +.. automodule:: paddle.v2.activation + :members: SoftSign + :noindex: diff --git a/doc/api/v2/config/evaluators.rst b/doc/api/v2/config/evaluators.rst index 39db51fa4abc370855ca3f2778b47464f33b6fce..9ac972fb193a2fb525edc507f7ba1303d2c8eabe 100644 --- a/doc/api/v2/config/evaluators.rst +++ b/doc/api/v2/config/evaluators.rst @@ -99,3 +99,12 @@ value_printer .. automodule:: paddle.v2.evaluator :members: value_printer :noindex: + +Detection +===== + +detection_map +------------- +.. automodule:: paddle.v2.evaluator + :members: detection_map + :noindex: diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 1efa74ecda4170332d96603ca2253c68468474f9..ddf0b055a92d80295b24255a5462d477e0d9c796 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -54,18 +54,23 @@ img_conv .. _api_v2.layer_context_projection: -context_projection +context_projection ------------------ .. 
autoclass:: paddle.v2.layer.context_projection :noindex: +row_conv +-------- +.. autoclass:: paddle.v2.layer.row_conv + :noindex: + Image Pooling Layer =================== img_pool -------- .. autoclass:: paddle.v2.layer.img_pool - :noindex: + :noindex: spp --- @@ -77,6 +82,11 @@ maxout .. autoclass:: paddle.v2.layer.maxout :noindex: +roi_pool +-------- +.. autoclass:: paddle.v2.layer.roi_pool + :noindex: + Norm Layer ========== @@ -94,12 +104,17 @@ sum_to_one_norm --------------- .. autoclass:: paddle.v2.layer.sum_to_one_norm :noindex: - + cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm :noindex: - + +row_l2_norm +----------- +.. autoclass:: paddle.v2.layer.row_l2_norm + :noindex: + Recurrent Layers ================ @@ -130,7 +145,7 @@ recurrent_group --------------- .. autoclass:: paddle.v2.layer.recurrent_group :noindex: - + lstm_step --------- .. autoclass:: paddle.v2.layer.lstm_step @@ -145,12 +160,12 @@ beam_search ------------ .. autoclass:: paddle.v2.layer.beam_search :noindex: - + get_output ---------- .. autoclass:: paddle.v2.layer.get_output :noindex: - + Mixed Layer =========== @@ -193,6 +208,10 @@ identity_projection .. autoclass:: paddle.v2.layer.identity_projection :noindex: +slice_projection +------------------- +.. autoclass:: paddle.v2.layer.slice_projection + :noindex: table_projection ---------------- @@ -203,7 +222,7 @@ trans_full_matrix_projection ---------------------------- .. autoclass:: paddle.v2.layer.trans_full_matrix_projection :noindex: - + Aggregate Layers ================ @@ -233,6 +252,11 @@ first_seq .. autoclass:: paddle.v2.layer.first_seq :noindex: +sub_seq +--------- +.. autoclass:: paddle.v2.layer.sub_seq + :noindex: + concat ------ .. autoclass:: paddle.v2.layer.concat @@ -243,6 +267,21 @@ seq_concat .. autoclass:: paddle.v2.layer.seq_concat :noindex: +seq_slice +--------- +.. autoclass:: paddle.v2.layer.seq_slice + :noindex: + +kmax_sequence_score +------------------- +.. autoclass:: paddle.v2.layer.kmax_sequence_score + :noindex: + +sub_nested_seq +-------------- +.. autoclass:: paddle.v2.layer.sub_nested_seq + :noindex: + Reshaping Layers ================ @@ -301,6 +340,16 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dot_prod +--------- +.. autoclass:: paddle.v2.layer.dot_prod + :noindex: + +out_prod +-------- +.. autoclass:: paddle.v2.layer.out_prod + :noindex: + power ----- .. autoclass:: paddle.v2.layer.power @@ -311,6 +360,16 @@ scaling .. autoclass:: paddle.v2.layer.scaling :noindex: +clip +---- +.. autoclass:: paddle.v2.layer.clip + :noindex: + +resize +------ +.. autoclass:: paddle.v2.layer.resize + :noindex: + slope_intercept --------------- .. autoclass:: paddle.v2.layer.slope_intercept @@ -328,11 +387,21 @@ cos_sim .. autoclass:: paddle.v2.layer.cos_sim :noindex: +l2_distance +----------- +.. autoclass:: paddle.v2.layer.l2_distance + :noindex: + trans ----- .. autoclass:: paddle.v2.layer.trans :noindex: +scale_shift +----------- +.. autoclass:: paddle.v2.layer.scale_shift + :noindex: + Sampling Layers =============== @@ -346,6 +415,19 @@ sampling_id .. autoclass:: paddle.v2.layer.sampling_id :noindex: +multiplex +--------- +.. autoclass:: paddle.v2.layer.multiplex + :noindex: + +Factorization Machine Layer +============================ + +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: + Slicing and Joining Layers ========================== @@ -374,9 +456,14 @@ multi_binary_label_cross_entropy_cost .. 
autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost :noindex: -huber_cost ----------- -.. autoclass:: paddle.v2.layer.huber_cost +huber_regression_cost +------------------------- +.. autoclass:: paddle.v2.layer.huber_regression_cost + :noindex: + +huber_classification_cost +------------------------- +.. autoclass:: paddle.v2.layer.huber_classification_cost :noindex: lambda_cost @@ -384,9 +471,9 @@ lambda_cost .. autoclass:: paddle.v2.layer.lambda_cost :noindex: -mse_cost --------- -.. autoclass:: paddle.v2.layer.mse_cost +square_error_cost +----------------- +.. autoclass:: paddle.v2.layer.square_error_cost :noindex: rank_cost @@ -434,10 +521,44 @@ smooth_l1_cost .. autoclass:: paddle.v2.layer.smooth_l1_cost :noindex: -Check Layer +multibox_loss +-------------- +.. autoclass:: paddle.v2.layer.multibox_loss + :noindex: + +Check Layer ============ eos --- .. autoclass:: paddle.v2.layer.eos :noindex: + +Miscs +===== + +dropout +-------- +.. autoclass:: paddle.v2.layer.dropout + :noindex: + +Activation with learnable parameter +=================================== + +prelu +-------- +.. autoclass:: paddle.v2.layer.prelu + :noindex: + +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + +Detection output Layer +====================== + +detection_output +---------------- +.. autoclass:: paddle.v2.layer.detection_output + :noindex: diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst index b2a617fff134035c04eeabbbaf6d9cbe2a525f1c..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 100644 --- a/doc/api/v2/config/networks.rst +++ b/doc/api/v2/config/networks.rst @@ -125,11 +125,8 @@ simple_attention :members: simple_attention :noindex: -Miscs -===== - -dropout_layer --------------- +dot_product_attention +--------------------- .. automodule:: paddle.v2.networks - :members: dropout_layer + :members: dot_product_attention :noindex: diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644 --- a/doc/api/v2/data.rst +++ b/doc/api/v2/data.rst @@ -2,112 +2,9 @@ Data Reader Interface and DataSets ================================== +.. toctree:: + :maxdepth: 1 -DataTypes -========= - -.. automodule:: paddle.v2.data_type - :members: - :noindex: - -DataFeeder -========== - -.. automodule:: paddle.v2.data_feeder - :members: - :noindex: - -Reader -====== - -.. automodule:: paddle.v2.reader - :members: - :noindex: - -.. automodule:: paddle.v2.reader.creator - :members: - :noindex: - -minibatch -========= - -.. automodule:: paddle.v2.minibatch - :members: - :noindex: - -Dataset -======= - -.. automodule:: paddle.v2.dataset - :members: - :noindex: - -mnist -+++++ - -.. automodule:: paddle.v2.dataset.mnist - :members: - :noindex: - -cifar -+++++ - -.. automodule:: paddle.v2.dataset.cifar - :members: - :noindex: - -conll05 -+++++++ - -.. automodule:: paddle.v2.dataset.conll05 - :members: get_dict,get_embedding,test - :noindex: - -imdb -++++ - -.. automodule:: paddle.v2.dataset.imdb - :members: - :noindex: - -imikolov -++++++++ - -.. automodule:: paddle.v2.dataset.imikolov - :members: - :noindex: - -movielens -+++++++++ - -.. automodule:: paddle.v2.dataset.movielens - :members: - :noindex: - -.. autoclass:: paddle.v2.dataset.movielens.MovieInfo - :noindex: - -.. autoclass:: paddle.v2.dataset.movielens.UserInfo - :noindex: - -sentiment -+++++++++ - -.. automodule:: paddle.v2.dataset.sentiment - :members: - :noindex: - -uci_housing -+++++++++++ - -.. 
automodule:: paddle.v2.dataset.uci_housing - :members: - :noindex: - -wmt14 -+++++ - -.. automodule:: paddle.v2.dataset.wmt14 - :members: - :noindex: - + data/data_reader.rst + data/image.rst + data/dataset.rst diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst new file mode 100644 index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e --- /dev/null +++ b/doc/api/v2/data/data_reader.rst @@ -0,0 +1,36 @@ +===================== +Data Reader Interface +===================== + + +DataTypes +========= + +.. automodule:: paddle.v2.data_type + :members: + :noindex: + +DataFeeder +========== + +.. automodule:: paddle.v2.data_feeder + :members: + :noindex: + +Reader +====== + +.. automodule:: paddle.v2.reader + :members: + :noindex: + +.. automodule:: paddle.v2.reader.creator + :members: + :noindex: + +minibatch +========= + +.. automodule:: paddle.v2.minibatch + :members: + :noindex: diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst new file mode 100644 index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365 --- /dev/null +++ b/doc/api/v2/data/dataset.rst @@ -0,0 +1,75 @@ +Dataset +======= + +.. automodule:: paddle.v2.dataset + :members: + :noindex: + +mnist ++++++ + +.. automodule:: paddle.v2.dataset.mnist + :members: + :noindex: + +cifar ++++++ + +.. automodule:: paddle.v2.dataset.cifar + :members: + :noindex: + +conll05 ++++++++ + +.. automodule:: paddle.v2.dataset.conll05 + :members: get_dict,get_embedding,test + :noindex: + +imdb +++++ + +.. automodule:: paddle.v2.dataset.imdb + :members: + :noindex: + +imikolov +++++++++ + +.. automodule:: paddle.v2.dataset.imikolov + :members: + :noindex: + +movielens ++++++++++ + +.. automodule:: paddle.v2.dataset.movielens + :members: + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.UserInfo + :noindex: + +sentiment ++++++++++ + +.. automodule:: paddle.v2.dataset.sentiment + :members: + :noindex: + +uci_housing ++++++++++++ + +.. automodule:: paddle.v2.dataset.uci_housing + :members: + :noindex: + +wmt14 ++++++ + +.. automodule:: paddle.v2.dataset.wmt14 + :members: + :noindex: diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst new file mode 100644 index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49 --- /dev/null +++ b/doc/api/v2/data/image.rst @@ -0,0 +1,5 @@ +Image Interface +=============== + +.. automodule:: paddle.v2.image + :members: diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst new file mode 100644 index 0000000000000000000000000000000000000000..5f15cad2b530dfb3702357b3c26885ac2a7b7beb --- /dev/null +++ b/doc/api/v2/fluid.rst @@ -0,0 +1,18 @@ +====================== +Fluid +====================== + +.. toctree:: + :maxdepth: 1 + + fluid/layers.rst + fluid/data_feeder.rst + fluid/executor.rst + fluid/initializer.rst + fluid/evaluator.rst + fluid/nets.rst + fluid/optimizer.rst + fluid/param_attr.rst + fluid/profiler.rst + fluid/regularizer.rst + fluid/io.rst diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst new file mode 100644 index 0000000000000000000000000000000000000000..0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa --- /dev/null +++ b/doc/api/v2/fluid/data_feeder.rst @@ -0,0 +1,9 @@ +=========== +DataFeeder +=========== + +DataFeeder +----------- +.. 
automodule:: paddle.v2.fluid.data_feeder + :members: DataFeeder + :noindex: diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst new file mode 100644 index 0000000000000000000000000000000000000000..a23f3301d0331e0ea3733f06444515eb4680cd31 --- /dev/null +++ b/doc/api/v2/fluid/evaluator.rst @@ -0,0 +1,9 @@ +=========== +Evaluator +=========== + +Evaluator +----------- +.. automodule:: paddle.v2.fluid.evaluator + :members: Evaluator + :noindex: diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst new file mode 100644 index 0000000000000000000000000000000000000000..3a283538c120cfa1ef646c390bb71c6251c23675 --- /dev/null +++ b/doc/api/v2/fluid/executor.rst @@ -0,0 +1,9 @@ +=========== +Executor +=========== + +Executor +----------- +.. automodule:: paddle.v2.fluid.executor + :members: Executor + :noindex: diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst new file mode 100644 index 0000000000000000000000000000000000000000..8f587837e9873370722062404f511654a9460587 --- /dev/null +++ b/doc/api/v2/fluid/initializer.rst @@ -0,0 +1,50 @@ +=========== +Initializer +=========== + + + +Initializer +----------- +.. automodule:: paddle.v2.fluid.initializer + :members: Initializer + :noindex: + + + +ConstantInitializer +------------------- +.. automodule:: paddle.v2.fluid.initializer + :members: ConstantInitializer + :noindex: + + + +UniformInitializer +------------------ +.. automodule:: paddle.v2.fluid.initializer + :members: UniformInitializer + :noindex: + + + +NormalInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: NormalInitializer + :noindex: + + +XavierInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: XavierInitializer + :noindex: + + +MSRAInitializer +--------------- +.. automodule:: paddle.v2.fluid.initializer + :members: MSRAInitializer + :noindex: + diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst new file mode 100644 index 0000000000000000000000000000000000000000..67f68c4e9e16b379207b8de114cdf769e056f78e --- /dev/null +++ b/doc/api/v2/fluid/io.rst @@ -0,0 +1,10 @@ +=========== +IO +=========== + + + +is_parameter +----------- +.. autofunction:: paddle.v2.fluid.io.is_parameter + :noindex: diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst new file mode 100644 index 0000000000000000000000000000000000000000..231ec2d4ba102a5d31c47cbc7a5d484ef17a7f3a --- /dev/null +++ b/doc/api/v2/fluid/layers.rst @@ -0,0 +1,546 @@ +========== +Layers +========== + + +fc +--- +.. autofunction:: paddle.v2.fluid.layers.fc + :noindex: + +embedding +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + +dynamic_lstm +------------ +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm + :noindex: + +dynamic_lstmp +------------- +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp + :noindex: + +dynamic_gru +----------- +.. autofunction:: paddle.v2.fluid.layers.dynamic_gru + :noindex: + +data +---- +.. autofunction:: paddle.v2.fluid.layers.data + :noindex: + +mean +---- +.. autofunction:: paddle.v2.fluid.layers.mean + :noindex: + +mul +--- +.. autofunction:: paddle.v2.fluid.layers.mul + :noindex: + +elementwise_add +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_add + :noindex: + +elementwise_sub +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_sub + :noindex: + +elementwise_mul +--------------- +.. 
autofunction:: paddle.v2.fluid.layers.elementwise_mul + :noindex: + +elementwise_div +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_div + :noindex: + + +dropout +------- +.. autofunction:: paddle.v2.fluid.layers.dropout + :noindex: + + +reshape +-------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +sigmoid +--------- +.. autofunction:: paddle.v2.fluid.layers.sigmoid + :noindex: + + +scale +--------- +.. autofunction:: paddle.v2.fluid.layers.scale + :noindex: + + +transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.transpose + :noindex: + + +sigmoid_cross_entropy_with_logits +--------------------------------- +.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits + :noindex: + + +cast +---- +.. autofunction:: paddle.v2.fluid.layers.cast + :noindex: + + +concat +------- +.. autofunction:: paddle.v2.fluid.layers.concat + :noindex: + + +sums +---- +.. autofunction:: paddle.v2.fluid.layers.sums + :noindex: + + +linear_chain_crf +---------------- +.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf + :noindex: + + +assign +------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + + +split_lod_tensor +---------------- +.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor + :noindex: + + +merge_lod_tensor +---------------- +.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor + :noindex: + +cos_sim +-------- +.. autofunction:: paddle.v2.fluid.layers.cos_sim + :noindex: + + +cross_entropy +------------- +.. autofunction:: paddle.v2.fluid.layers.cross_entropy + :noindex: + + + +square_error_cost +----------------- +.. autofunction:: paddle.v2.fluid.layers.square_error_cost + :noindex: + + +accuracy +--------- +.. autofunction:: paddle.v2.fluid.layers.accuracy + :noindex: + + +sequence_conv +------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_conv + :noindex: + + +conv2d +------ +.. autofunction:: paddle.v2.fluid.layers.conv2d + :noindex: + + +sequence_pool +------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_pool + :noindex: + + +sequence_first_step +------------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_first_step + :noindex: + + +sequence_last_step +------------------ +.. autofunction:: paddle.v2.fluid.layers.sequence_last_step + :noindex: + + +pool2d +------ +.. autofunction:: paddle.v2.fluid.layers.pool2d + :noindex: + + +batch_norm +---------- +.. autofunction:: paddle.v2.fluid.layers.batch_norm + :noindex: + + +beam_search_decode +------------------ +.. autofunction:: paddle.v2.fluid.layers.beam_search_decode + :noindex: + + +lod_rank_table +-------------- +.. autofunction:: paddle.v2.fluid.layers.lod_rank_table + :noindex: + + +max_sequence_len +---------------- +.. autofunction:: paddle.v2.fluid.layers.max_sequence_len + :noindex: + + +topk +----- +.. autofunction:: paddle.v2.fluid.layers.topk + :noindex: + + +lod_tensor_to_array +------------------- +.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array + :noindex: + + + +array_to_lod_tensor +------------------- +.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor + :noindex: + + + + +fill_constant +------------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant + :noindex: + + + +fill_constant_batch_size_like +----------------------------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like + :noindex: + + +ones +---- +.. autofunction:: paddle.v2.fluid.layers.ones + :noindex: + + +zeros +----- +.. 
autofunction:: paddle.v2.fluid.layers.zeros + :noindex: + + +increment +--------- +.. autofunction:: paddle.v2.fluid.layers.increment + :noindex: + + +array_write +----------- +.. autofunction:: paddle.v2.fluid.layers.array_write + :noindex: + + + +create_array +------------ +.. autofunction:: paddle.v2.fluid.layers.create_array + :noindex: + + +less_than +--------- +.. autofunction:: paddle.v2.fluid.layers.less_than + :noindex: + + +array_read +---------- +.. autofunction:: paddle.v2.fluid.layers.array_read + :noindex: + + +shrink_memory +-------------- +.. autofunction:: paddle.v2.fluid.layers.shrink_memory + :noindex: + + +array_length +------------- +.. autofunction:: paddle.v2.fluid.layers.array_length + :noindex: + + +conv2d_transpose +---------------- +.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose + :noindex: + + +sequence_expand +--------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_expand + :noindex: + + +gru_unit +-------- +.. autofunction:: paddle.v2.fluid.layers.gru_unit + :noindex: + + +lstm_unit +--------- +.. autofunction:: paddle.v2.fluid.layers.lstm_unit + :noindex: + + +sequence_softmax +---------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_softmax + :noindex: + + +reduce_sum +---------- +.. autofunction:: paddle.v2.fluid.layers.reduce_sum + :noindex: + + +reduce_mean +----------- +.. autofunction:: paddle.v2.fluid.layers.reduce_mean + :noindex: + + +reduce_max +---------- +.. autofunction:: paddle.v2.fluid.layers.reduce_max + :noindex: + + +reduce_min +---------- +.. autofunction:: paddle.v2.fluid.layers.reduce_min + :noindex: + + +split +----- +.. autofunction:: paddle.v2.fluid.layers.split + :noindex: + + +matmul +------ +.. autofunction:: paddle.v2.fluid.layers.matmul + :noindex: + +logsigmoid +---------- +.. autofunction:: paddle.v2.fluid.layers.logsigmoid + :noindex: + +exp +--- +.. autofunction:: paddle.v2.fluid.layers.exp + :noindex: + +relu +---- +.. autofunction:: paddle.v2.fluid.layers.relu + :noindex: + +tanh +---- +.. autofunction:: paddle.v2.fluid.layers.tanh + :noindex: + +tanh_shrink +----------- +.. autofunction:: paddle.v2.fluid.layers.tanh_shrink + :noindex: + +softshrink +---------- +.. autofunction:: paddle.v2.fluid.layers.softshrink + :noindex: + +sqrt +---- +.. autofunction:: paddle.v2.fluid.layers.sqrt + :noindex: + +abs +---- +.. autofunction:: paddle.v2.fluid.layers.abs + :noindex: + +ceil +---- +.. autofunction:: paddle.v2.fluid.layers.ceil + :noindex: + +floor +----- +.. autofunction:: paddle.v2.fluid.layers.floor + :noindex: + +round +----- +.. autofunction:: paddle.v2.fluid.layers.round + :noindex: + +reciprocal +---------- +.. autofunction:: paddle.v2.fluid.layers.reciprocal + :noindex: + +log +--- +.. autofunction:: paddle.v2.fluid.layers.log + :noindex: + +square +------ +.. autofunction:: paddle.v2.fluid.layers.square + :noindex: + +softplus +-------- +.. autofunction:: paddle.v2.fluid.layers.softplus + :noindex: + +softsign +--------- +.. autofunction:: paddle.v2.fluid.layers.softsign + :noindex: + +brelu +----- +.. autofunction:: paddle.v2.fluid.layers.brelu + :noindex: + +leaky_relu +---------- +.. autofunction:: paddle.v2.fluid.layers.leaky_relu + :noindex: + +soft_relu +--------- +.. autofunction:: paddle.v2.fluid.layers.soft_relu + :noindex: + +elu +---- +.. autofunction:: paddle.v2.fluid.layers.elu + :noindex: + +relu6 +----- +.. autofunction:: paddle.v2.fluid.layers.relu6 + :noindex: + +pow +---- +.. autofunction:: paddle.v2.fluid.layers.pow + :noindex: + +hard_shrink +----------- +.. 
autofunction:: paddle.v2.fluid.layers.hard_shrink + :noindex: + +thresholded_relu +---------------- +.. autofunction:: paddle.v2.fluid.layers.thresholded_relu + :noindex: + +hard_sigmoid +------------- +.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid + :noindex: + +swish +------ +.. autofunction:: paddle.v2.fluid.layers.swish + :noindex: + +im2sequence +------ +.. autofunction:: paddle.v2.fluid.layers.im2sequence + :noindex: + +edit_distance +--------------- +.. autofunction:: paddle.v2.fluid.layers.edit_distance_error + :noindex: + +ctc_greedy_decoder +--------------- +.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder + :noindex: + +l2_normalize +------------ +.. autofunction:: paddle.v2.fluid.layers.l2_normalize + :noindex: + +sequence_reshape +---------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_reshape + :noindex: + +row_conv +-------- +.. autofunction:: paddle.v2.fluid.layers.row_conv + :noindex: + +multiplex +--------- +.. autofunction:: paddle.v2.fluid.layers.multiplex + :noindex: diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst new file mode 100644 index 0000000000000000000000000000000000000000..500019bc507f859c4c91de5d322a82eb1e78e2de --- /dev/null +++ b/doc/api/v2/fluid/nets.rst @@ -0,0 +1,33 @@ +=========== +Nets +=========== + +simple_img_conv_pool +-------------------- +.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool + :noindex: + + +img_conv_group +--------------- +.. autofunction:: paddle.v2.fluid.nets.img_conv_group + :noindex: + + +sequence_conv_pool +------------------ +.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool + :noindex: + + +glu +--- +.. autofunction:: paddle.v2.fluid.nets.glu + :noindex: + + +scaled_dot_product_attention +---------------------------- +.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention + :noindex: + diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..19b4940f08de3e2f7dc177f2961e538946d10a78 --- /dev/null +++ b/doc/api/v2/fluid/optimizer.rst @@ -0,0 +1,54 @@ +=========== +Optimizer +=========== + +Optimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: Optimizer + :noindex: + + +SGDOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: SGDOptimizer + :noindex: + + + +MomentumOptimizer +----------------- +.. automodule:: paddle.v2.fluid.optimizer + :members: MomentumOptimizer + :noindex: + + + +AdagradOptimizer +---------------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdagradOptimizer + :noindex: + + +AdamOptimizer +------------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamOptimizer + :noindex: + + +AdamaxOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamaxOptimizer + :noindex: + + +DecayedAdagradOptimizer +----------------------- +.. automodule:: paddle.v2.fluid.optimizer + :members: DecayedAdagradOptimizer + :noindex: + diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst new file mode 100644 index 0000000000000000000000000000000000000000..ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50 --- /dev/null +++ b/doc/api/v2/fluid/param_attr.rst @@ -0,0 +1,11 @@ +=========== +ParamAttr +=========== + + + +ParamAttr +----------- +.. 
automodule:: paddle.v2.fluid.param_attr + :members: ParamAttr + :noindex: diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst new file mode 100644 index 0000000000000000000000000000000000000000..7d4042d1f41c12c4a551ba6576559d612116872a --- /dev/null +++ b/doc/api/v2/fluid/profiler.rst @@ -0,0 +1,10 @@ +=========== +Profiler +=========== + + + +Profiler +----------- +.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler + :noindex: diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..868e225ed3d59e79aeb217fb88081ea25f80fa2c --- /dev/null +++ b/doc/api/v2/fluid/regularizer.rst @@ -0,0 +1,25 @@ +=========== +Regularizer +=========== + +WeightDecayRegularizer +---------------------- +.. automodule:: paddle.v2.fluid.regularizer + :members: WeightDecayRegularizer + :noindex: + + +L2DecayRegularizer +------------------ +.. automodule:: paddle.v2.fluid.regularizer + :members: L2DecayRegularizer + :noindex: + + + +L1DecayRegularizer +------------------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L1DecayRegularizer + + diff --git a/doc/design/api.md b/doc/design/api.md index 8185d2af0ea264a2e7b4e28b9ed05279e4a22014..e6a4638d9100d9b07c3ee6b92b530a17eae1c162 100644 --- a/doc/design/api.md +++ b/doc/design/api.md @@ -3,7 +3,7 @@ ## Ingredients As our design principle is starting from the essence: how could we -allow users to express and solve their problems at neural networks. +allow users to express and solve their problems as neural networks. Some essential concepts that our API have to provide include: 1. A *topology* is an expression of *layers*. @@ -233,7 +233,7 @@ paddle.dist_train(model, num_parameter_servers=15) ``` -The pseudo code if `paddle.dist_train` is as follows: +The pseudo code of `paddle.dist_train` is as follows: ```python def dist_train(topology, parameters, trainer, reader, ...): diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md new file mode 100644 index 0000000000000000000000000000000000000000..f9991541bc51c6e13ffce4e9cec60f73dc800121 --- /dev/null +++ b/doc/design/auto_gradient_check.md @@ -0,0 +1,146 @@ +## Auto Gradient Checker Design + +## Background: +- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right: + 1. you should derive the right backpropagation formula from the forward computation. + 2. you should implement it correctly in C++. + 3. it's difficult to prepare test data. + +- Auto gradient checking gets a numerical gradient from the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages: + 1. the numerical gradient checker only needs the forward operator. + 2. the user only needs to prepare the input data for the forward Operator. + +## Mathematical Theory +The following two documents from Stanford give a detailed explanation of how to compute the numerical gradient and why it's useful.
+ +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) + + +## Numeric Gradient Implementation +### Python Interface +```python +def get_numerical_gradient(op, + input_values, + output_name, + input_to_check, + delta=0.005, + local_scope=None): + """ + Get the numeric gradient for an operator's input. + + :param op: C++ operator instance, could be a network + :param input_values: The input variables. Should be a dictionary, whose key is + the variable name, and value is a numpy array. + :param output_name: The final output variable name. + :param input_to_check: The input variable with respect to which to compute the gradient. + :param delta: The perturbation value for the numeric gradient method. The + smaller delta is, the more accurate the result will be. But if delta is + too small, it will suffer from numerical stability problems. + :param local_scope: The local scope used for get_numeric_gradient. + :return: The gradient array in numpy format. + """ +``` + +### Explanation: + +- Why `output_name` is needed + - An Operator may have multiple Outputs; one can get an independent gradient from each Output. So the caller should specify the name of the output variable. + +- Why `input_to_check` is needed + - One operator may have multiple inputs. The Gradient Op can calculate the gradients of these inputs at the same time, but the numeric gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute the gradients of multiple inputs, you can call `get_numeric_gradient` multiple times. + + +### Core Algorithm Implementation + + +```python + # we only compute the gradient of one element at a time. + # we use a for loop to compute the gradient of each element. + for i in xrange(tensor_size): + # get one input element by its index i. + origin = tensor_to_check.get_float_element(i) + + # add delta to it, run op and then get the new value of the result tensor. + x_pos = origin + delta + tensor_to_check.set_float_element(i, x_pos) + y_pos = get_output() + + # subtract delta from this element, run op and get the new value of the result tensor. + x_neg = origin - delta + tensor_to_check.set_float_element(i, x_neg) + y_neg = get_output() + + # restore old value + tensor_to_check.set_float_element(i, origin) + + # compute the gradient of this element and store it into a numpy array. + gradient_flat[i] = (y_pos - y_neg) / delta / 2 + + # reshape the gradient result to the shape of the source tensor. + return gradient_flat.reshape(tensor_to_check.get_dims()) +``` + +## Auto Gradient Checker Framework + +Each Operator Kernel has three kinds of Gradient: + +1. Numerical gradient +2. CPU kernel gradient +3. GPU kernel gradient (if supported) + +The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps: + +1. calculate the numerical gradient +2. calculate the CPU kernel gradient with the backward Operator and compare it with the numerical gradient +3. 
calculate the GPU kernel gradient with the backward Operator and compare it with the numerical gradient (if supported) + +#### Python Interface + +```python + def check_grad(self, + forward_op, + input_vars, + inputs_to_check, + output_name, + no_grad_set=None, + only_cpu=False, + max_relative_error=0.005): + """ + :param forward_op: used to create backward_op + :param input_vars: numpy values of the input variables. The following + computation will use these variables. + :param inputs_to_check: the input variable with respect to which to compute the gradient. + :param output_name: The final output variable name. + :param max_relative_error: The relative tolerance parameter. + :param no_grad_set: used when creating backward ops + :param only_cpu: only compute and check the gradient on the CPU kernel. + :return: + """ +``` + +### How to check if two numpy arrays are close enough? +If `abs_numerical_grad` is nearly zero, we use the absolute error for `numerical_grad` instead of the relative error. + +```python +numerical_grad = ... +operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) + +abs_numerical_grad = numpy.abs(numerical_grad) +# if abs_numerical_grad is nearly zero, then use abs error for numerical_grad, not relative +# error. +abs_numerical_grad[abs_numerical_grad < 1e-3] = 1 + +diff_mat = numpy.abs(numerical_grad - operator_grad) / abs_numerical_grad +max_diff = numpy.max(diff_mat) +``` + + +#### Notes: +The input data for the auto gradient checker should be reasonable to avoid numerical stability problems. + + +#### Refs: + +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) diff --git a/doc/design/backward.md b/doc/design/backward.md new file mode 100644 index 0000000000000000000000000000000000000000..20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a --- /dev/null +++ b/doc/design/backward.md @@ -0,0 +1,158 @@ +# Backward Building + +## Motivation + +In neural networks, most models are solved by the backpropagation algorithm (known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part. + +When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes gradients of its corresponding `op`'s outputs, and calculates gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters. + +## Challenges + +The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all being gathered in a single graph.
Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and new created `variable`s into the right place. + +## Usage + +Although the whole algorithm is comprised of many functions, only one is exposed as API: + +```python +def append_backward(loss, parameter_list=None, no_grad_set=None): + """ + Append backward part to main_program + + Args: + loss(Variable): The variable generated by the cost function. + parameter_list(list): Parameters that need to be updated by optimizers. + If None, it means all parameters need to be updated. + + no_grad_set(set): Variables that have no gradients in Block 0. + If None, the set will be generated inside the function and + contains all variables with `step_gradient=True` from all blocks. + + Return: + (list[Variable]): list of (parameters, gradients) pair. + """ +``` + +By invoking this API, the framework appends backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient generated and backpropagation starts. `parameter_list` marks all parameters needs updating. If it's `None`, all parameter will be updated by optimizers. `no_grad_set` marks variables without gradient. if all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run. + +This API will be invoked automatically before optimizer building. +As a result, in most cases, users do not need to invoke the API by themselves to append backward part. + +## Implementation + +The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables. + +### Creating `grad_op`s + +The creating of `grad_op`s is implemented by: + +```python +def _append_backward_ops_(target, + block, + target_block, + no_grad_dict, + grad_to_var): + """ + Create all grad ops, and insert them into given block + + Args: + target(Variable): the target variable of forward pass + block(Block): the block where forward ops are + target_block(Block): the block which is going to hold new generated grad ops + no_grad_dict(dict): + key(int) block index + val(set) a set of varibale names. These varibales have no gradient + grad_to_var(dict)(output argument): + key(str): grad variable name + val(str): corresponding forward variable name + """ +``` + +Given a `block`, the function will traverses all `op`s in this block in reverse order, gets corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, then append it to `target_block`. + +However, some specific `op`(e.g. `while_op`, `if_else_op`) can hold its own sub-block. For these sub-blocks contains `op`s as well, the `grad_op` creating should be recursive. + +During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process: + +``` +******* pseudo-code ******** +for op in reversed(block.ops): + if op has an attribute named 'sub_block': + Get the sub-block(`s_block`) from op's attribute. + Create a new block(`grad_s_block`), whose father is `s_block`. 
+ Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block` + + Invoke `core.get_grad_op_desc()` to get op's grad_op. + Insert name correspondings between variables and their gradients of the grad_op to grad_to_var + Assign grad_s_block to grad_op as it's 'sub_block' attribute. + Append grad_op to current target_block. +``` + +The first invoking of `_append_backward_ops_()` is initiated by `append_backward()`, in which parameters `block` and `target_block` are all assigned with root block(the block with index 0). + +### Corner Cases of `grad_op` Creating + +In the previous section, we show the regular process of `grad_op` creating. However, in some corner cases, the conventional algorithm is not enough to get the correct result and appending handling is required. These additional processes run after the algorithm mentioned above and do some special adjusts on its output `grad_op`s. + +#### Shared Variables + +If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op`s in the next backward pass. To make the gradient result being the sum of all `grad_op`s' outputs instead of the last running one, we assign each output with a temporary variable and then add a `sum_op` to add them up. + +For the debug convenience, if the final gradient name is `w@GRAD`, it's corresponding temporary variables will be named as `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`... + +See function `_addup_repetitive_outputs_` in `backward.py` for implementation details. + +#### No Gradient Variables + +In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass. + +Another situation is all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. For `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped. + +It should be noted that all these zero gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros. + +This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`s one-by-one, removes who can be skipped and inserts `fill_zeros_like_op` when its necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False). + +### Creating Backward Variables + +Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. The backward variable creating job will be done by: + +```python +def _append_backward_vars_(block, + start_op_idx, + grad_to_var, + grad_info_map): + """ + Create new variables required by backward pass. 
+ + Args: + block(Block): the block where new variables will be created + start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created + grad_to_var(dict): + key(str): grad variable name + val(str): corresponding forward variable name + In most cases, this dict is generated by _append_backward_ops_() + grad_info_map(dict)(output argument): + key(str): forward variable name + val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index + """ +``` + +Given a `block`, this function traverses all the `grad_op`s in it(The argument `start_op_idx` indicates where the grad_op sequence starts.) and creates all the uncreated outputs. The *pseudo-code* shows this process: + +``` +for op in block.ops[start_op_idx : ]: + + if op has an attribute named 'sub_block': + Get the sub-block(`s_block`) from op's attribute. + Invoke _append_backward_vars_(), with `block=s_block` + + for var_name in op.all_output_names(): + if block.has_var_recursive(var_name) or var_name is the name of empty variable: + continue + create a new variable named 'var_name' in block + if grad_to_var.has_key(var_name): + set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name. block) + + do op's var type inference + do op's shape inference +``` diff --git a/doc/design/block.md b/doc/design/block.md new file mode 100644 index 0000000000000000000000000000000000000000..907a2def557fd472ac4d679c73447bd9107d1190 --- /dev/null +++ b/doc/design/block.md @@ -0,0 +1,336 @@ +# Design Doc: Block and Scope + +## The Representation of Computation + +Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation: + +- Caffe, Torch, and Paddle: sequences of layers. +- TensorFlow, Caffe2, Mxnet: graph of operators. +- PaddlePaddle: nested blocks, like C++ and Java programs. + +## Block in Programming Languages and Deep Learning + +In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators. + +Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning: + +| programming languages | PaddlePaddle | +|-----------------------|-----------------------| +| for, while loop | RNN, WhileOp | +| if, if-else, switch | IfElseOp, SwitchOp | +| sequential execution | a sequence of layers | + +A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes. + +## Stack Frames and the Scope Hierarchy + +The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs: + +| programming languages | PaddlePaddle | +|-----------------------|---------------------------------| +| stack | scope hierarchy | +| stack frame | scope | +| push at entering block| push at entering block | +| pop at leaving block | destroy when minibatch completes| + +1. In traditional programs: + + - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables. + - After the execution leaves the right curly brace, the runtime pops the frame. + - The maximum number of frames in the stack is the maximum depth of nested blocks. + +1. In PaddlePaddle + + - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables. 
+ - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*. + - The height of the highest tree is the maximum depth of nested blocks. + - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy. + +## Use Blocks in C++ and PaddlePaddle Programs + +Let us consolidate the discussion by presenting some examples. + +### Blocks with `if-else` and `IfElseOp` + +The following C++ programs shows how blocks are used with the `if-else` structure: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int z = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} + +``` + +An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows: + +```python +import paddle as pd + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` . + +The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. + + +### Blocks with `for` and `RNNOp` + +The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) : + +```python +x = sequence([10, 20, 30]) # shape=[None, 1] +m = var(0) # shape=[1] +W = var(0.314, param=true) # shape=[1] +U = var(0.375, param=true) # shape=[1] + +rnn = pd.rnn() +with rnn.step(): + h = rnn.memory(init = m) + h_prev = rnn.previous_memory(h) + a = layer.fc(W, x) + b = layer.fc(U, h_prev) + s = pd.add(a, b) + act = pd.sigmoid(s) + rnn.update_memory(h, act) + rnn.output(a, b) +o1, o2 = rnn() +``` +has its equivalent C++ program as follows + +```c++ +int* x = {10, 20, 30}; +int* m = {0}; +int* W = {0.314}; +int* U = {0.375}; + +int mem[sizeof(x) / sizeof(x[0]) + 1]; +int o1[sizeof(x) / sizeof(x[0]) + 1]; +int o2[sizeof(x) / sizeof(x[0]) + 1]; +for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) { + int x = x[i-1]; + if (i == 1) mem[0] = m; + int a = W * x; + int b = Y * mem[i-1]; + int s = fc_out + hidden_out; + int act = sigmoid(sum); + mem[i] = act; + o1[i] = act; + o2[i] = hidden_out; +} +``` + +## Compilation and Execution + +Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference. + +The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file. 
+ +## The "Binary Executable File Format" + +The definition of the protobuf message is as follows: + +```protobuf +message BlockDesc { + repeated VarDesc vars = 1; + repeated OpDesc ops = 2; +} +``` + +The step net in above RNN example would look like + +``` +BlockDesc { + vars = { + VarDesc {...} // x + VarDesc {...} // h + VarDesc {...} // fc_out + VarDesc {...} // hidden_out + VarDesc {...} // sum + VarDesc {...} // act + } + ops = { + OpDesc {...} // matmul + OpDesc {...} // add_two + OpDesc {...} // sigmoid + } +}; +``` + +Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like: + +``` +OpDesc { + inputs = {0} // the index of x in vars of BlockDesc above + outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above + attrs { + "states" : {1} // the index of h + "step_net" : + } +}; +``` + +This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block. + + +## The Compilation of Blocks + +During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator). + +VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope. +Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example: + +```python +a = pd.Variable(shape=[20, 20]) +b = pd.fc(a, params=["fc.w", "fc.b"]) + +rnn = pd.create_rnn() +with rnn.stepnet(): + x = a.as_step_input() + # reuse fc's parameter + fc_without_b = pd.get_variable("fc.w") + rnn.output(fc_without_b) + +out = rnn() +``` +The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance. + +In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc. + +To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers. + +`SymbolTable` can do the following: + +- store the definitions (some names and attributes) of variables and operators, +- verify if a variable was declared, +- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). + + +```c++ +// Information in SymbolTable is enough to trace the dependency graph. So maybe +// the Eval() interface takes a SymbolTable is enough. +class SymbolTable { + public: + SymbolTable(SymbolTable* parent) : parent_(parent) {} + + OpDesc* NewOp(const string& name=""); + + // TODO determine whether name is generated by python or C++. + // Currently assume that a unique name will be generated by C++ if the + // argument name is left default. + VarDesc* Var(const string& name=""); + + // find a VarDesc by name, if recursive is true, find parent's SymbolTable + // recursively. + // this interface is introduced to support InferShape, find protobuf messages + // of variables and operators, pass pointers into InferShape. + // + // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should + // be proposed and embedded into pybind to enable python operation on C++ pointers. 
+ VarDesc* FindVar(const string& name, bool recursive=true); + + OpDesc* FindOp(const string& name); + + BlockDesc Compile() const; + + private: + SymbolTable* parent_; + + map ops_; + map vars_; +}; +``` + +After all the description of variables and operators is added into SymbolTable, +the block has enough information to run. + +The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions. + + +```c++ +namespace { + +class Block : OperatorBase { +public: + Block(const BlockDesc& desc) desc_(desc) {} + + void InferShape(const framework::Scope& scope) const override { + if (!symbols_ready_) { + CreateVariables(scope); + CreateOperators(); + } + // should run InferShape first. + for (auto& op : runtime_table_.ops()) { + op->InferShape(scope); + } + } + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first."); + for (auto& op : runtime_table_.ops()) { + op->Run(scope, place); + } + } + + void CreateVariables(const framework::Scope& scope); + void CreateOperators(); + + // some other necessary interfaces of NetOp are listed below + // ... + +private: + BlockDesc desc_; + bool symbols_ready_{false}; +}; +``` + +## The Execution of Blocks + +Block inherits from OperatorBase, which has a Run method. +Block's Run method will run its operators sequentially. + +There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets. + +The definition of Eval is as follows: + +```c++ +// clean a block description by targets using the corresponding dependency graph. +// return a new BlockDesc with minimal number of operators. +// NOTE: The return type is not a Block but the block's description so that this can be distributed +// to a cluster. +BlockDesc Prune(const BlockDesc& desc, vector targets); + +void Block::Eval(const vector& targets, + const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) { + BlockDesc min_desc = Prune(desc_, targets); + Block min_block(min_desc); + min_block.Run(scope, dev_ctx); +} +``` diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md index 310739f37ae48934afe1d042e87efef85b98f1fc..bf0e4dddc1b640ecbce489f65820aaf8a4b3b1e7 100644 --- a/doc/design/build_system/README.md +++ b/doc/design/build_system/README.md @@ -105,3 +105,48 @@ shared_library(api ### Implementation As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`. + +### Using Package Manager For Go + +Building Go binaries and libraries need to satisfy their dependencies, generally +we can do `go get ./...` to download and compile all external dependencies. The +problems are: + +1. `go get` will always get the latest code from the default branch of the + remote repo, so changes of dependents might break the build. This is very + different with what we already have in `cmake/external` which download a + specific version or commit id of the dependency. +1. Some locations can not access external dependencies through the internet, as mentioned + in https://github.com/PaddlePaddle/Paddle/issues/2605. 
Using package management + tools can package the dependencies as a "vendor" package, which can be mirrored + at many cloud file hosting, so users what to compile paddle by themselves can + download this "vendor" package from a mirror site. + +#### Choose A Suitable Tool + +As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools) +list dozens of Go package managers. We choose the tool using following principles: + +- Most "active" projects with more stars, more pull requests or commits +- Widely used project + +After comparing all these projects, we shall choose between the most popular +tools: Godep and Glide. + +Here's a brief comparison between Godep and Glide +: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are +also many complaints about using `Godep`. There's also a new "official" pakcage +management tool has been started at: https://github.com/golang/dep to resolve +such problems, but it's currently at Alpha stage. So the best choice now is +glide obviously. + +#### Manage Go Packages + +- Dependencies: `go/glide.yaml` will store the dependencies and their versions which + is directly imported by paddle. `go/glide.lock` will store all dependencies recursively + with their commit id. Builds will "lock" to these packages if we don't `glide up` + them +- Vendor package: `go/vendor` directory will generated when running `cmake` command. `cmake` + will download the code corresponding to `go/glide.lock`. If we put a vendor folder + under `go/`, cmake will just check the commit id to the packages under the folder, + if commit id matches, there will be no download at all. diff --git a/doc/design/ci_build_whl.png b/doc/design/ci_build_whl.png new file mode 100644 index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40 Binary files /dev/null and b/doc/design/ci_build_whl.png differ diff --git a/doc/design/cluster_train/README.md b/doc/design/cluster_train/README.md index 74961f80050c6b2723889b51416a2e8048174b00..177a5f5d54bd924fab34795219ce1f7b270c8e25 100644 --- a/doc/design/cluster_train/README.md +++ b/doc/design/cluster_train/README.md @@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below: 1. When a new pass of training starts, all tasks will be placed in the todo queue. -1. The master server will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion. -1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer. -1. If a task timeout. the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded. +1. Upon trainer requests for new task, the master server will dispatch a task from todo queue to it, put the task in the pending queue and wait for completion. +1. The trainer will work on its task and tell the master server once the task is completed and ask for new task. The master server will dispatch a new task to that trainer. +1. If a task fails for any reason in trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, then it will be discarded. 1. 
The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero. ### Trainer Process The trainer process will: -- Receive tasks from the master. -- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers. +- Request tasks from the master. +- Work on the tasks. +- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers. ### Parameter Server Process @@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at 1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations. 1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them. -1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers. -1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update. +1. Writes its IP address to */master/addr* so that trainers can discover it. +1. Listens for trainers' task requests, dispatches one upon request, and updates the task queue using an etcd transaction to ensure the lock is held during the update. When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes. @@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i When the trainer is started by the Kubernetes, it executes the following steps at startup: -1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count. -1. Generates a unique ID, and sets key `/trainer/` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline. -1. Waits for tasks from the master to start training. +1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*. +1. Finds and watches */master/addr* to get the master's address. +1. Requests tasks from the master to start training. -If trainer's etcd lease expires, it will try set key `/trainer/` again so that the master server can discover the trainer again. - -When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training. +When a trainer fails, Kubernetes will try to restart it. The recovered trainer will fetch tasks from the master and go on training. ### Parameter Server Process diff --git a/doc/design/cluster_train/large_model_dist_train.md b/doc/design/cluster_train/large_model_dist_train.md new file mode 100644 index 0000000000000000000000000000000000000000..0c4b5bc24c854b7062d509249bea9c50d42bd5f1 --- /dev/null +++ b/doc/design/cluster_train/large_model_dist_train.md @@ -0,0 +1,101 @@ +# Analysis of large model distributed training in Paddle + +***NOTE: This is only a note on how we implemented this scheme in V1, not a new design.*** + +## What is it + +We often encounter cases where the embedding layer parameters (sparse) are so large that we cannot store them in the trainer's memory during training.
So we need to put them on several servers and fetch them row by row instead of fetching all of the parameters. + +## How to use + +Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer, and also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting the pserver processes. + +Accordingly, configure your embedding layers like: + +```python +SPARSE_REMOTE=True + +w1 = data_layer(name="w1", size=dict_size) +emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE)) +w2 = data_layer(name="w2", size=dict_size) +emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE)) +... +``` + +## Implementation details + +```c++ +enum MatType { + MAT_NORMAL, + MAT_NORMAL_SHARED, + MAT_VALUE_SHARED, + MAT_SPARSE_ROW_IDS, + MAT_SPARSE_ROW_AUTO_GROW, + MAT_CACHE_ROW, + MAT_SPARSE_ROW, + MAT_SPARSE_ROW_PREFETCH, + MAT_SPARSE_ROW_PREFETCH_FULL_SIZE, +}; +``` + +`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only rows of the matrix during training. + +In `trainer_internal.cpp:L93 trainOneBatch`: + +```c++ + if (config_->getOptConfig().use_sparse_remote_updater()) { + REGISTER_TIMER("prefetch"); + gradientMachine_->prefetch(inArgs); + parameterUpdater_->getParametersRemote(); + } +``` + +When doing the actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from the pserver. + +In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`: + +```c++ +if (fullSize) { + ... +} else { +getParams = [&] { + parameterClient_->getParameterSparse( + /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); +}; +applyL1 = [](Parameter& para, real decayRate) { + para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); +}; +} +``` + +Calling `parameterClient_->getParameterSparse` will do a remote call to the pserver's `getParameterSparse`: + +```c++ +void ParameterServer2::getParameterSparse(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)inputBuffers; + auto& buffer = *readWriteBuffer_; + size_t numReals = 0; + for (const auto& block : request.blocks()) { + numReals += getParameterConfig(block).dims(1); + } + buffer.resize(numReals); + + VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; + + ReadLockGuard guard(parameterMutex_); + size_t offset = 0; + for (const auto& block : request.blocks()) { + size_t width = getParameterConfig(block).dims(1); + Buffer buf = {buffer.data() + offset, width}; + int type = request.send_back_parameter_type(); + sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); + offset += width; + } +} +``` + +`getParameterConfig(block).dims(1)` returns the width of the current "parameter block" (a shard of the parameter object), +then the `getParameterSparse` remote call returns only one row of data to the client.
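To make the row-wise fetch idea concrete, here is a minimal, self-contained Python sketch of the mechanism described above. It only illustrates the protocol; the names `pserver_table` and `get_parameter_sparse` are hypothetical stand-ins for the C++ implementation, not Paddle's actual API.

```python
import numpy as np

# Hypothetical parameter table held by a pserver: a large embedding matrix
# stored row by row (dict_size rows, each of width 32).
dict_size, width = 100000, 32
pserver_table = np.random.rand(dict_size, width).astype("float32")


def get_parameter_sparse(row_ids):
    """Server side: return only the requested rows, never the full matrix."""
    return {row_id: pserver_table[row_id] for row_id in sorted(row_ids)}


# Trainer side: before running a batch, prefetch only the rows that the
# batch's sparse input actually touches (cf. gradientMachine_->prefetch).
batch_word_ids = [3, 17, 3, 42]
fetched_rows = get_parameter_sparse(set(batch_word_ids))
print("fetched", len(fetched_rows), "rows instead of", dict_size)
```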
diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md index b3e4079010490b69db1de28157f0cab80cad2381..474b8c572cd92fc87e9f7f3f2b19d12cccd158de 100644 --- a/doc/design/cluster_train/pserver_client.md +++ b/doc/design/cluster_train/pserver_client.md @@ -74,14 +74,25 @@ typedef enum { typedef struct { char* name; paddle_element_type element_type; - void* content; + unsigned char* content; int content_len; } paddle_parameter, paddle_gradient; -typedef struct paddle_pserver_client paddle_pserver_client; +typedef int paddle_pserver_client; -paddle_pserver_client* paddle_new_pserver_client(); -void paddle_pserver_client_release(paddle_pserver_client* client); +/** + * @brief creates a pserver client that talks to etcd for coordination. + */ +paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr); + +/** + * @brief creates a pserver client given pserver addresses. + * + * @param pserver_addrs comma-separated pserver addresses. + * @param selected if current pserver client is selected to initialize all parameter servers. + */ +paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected); +void paddle_pserver_client_release(paddle_pserver_client c); /** * @brief paddle_begin_init_params begins to initialize parameters on @@ -95,7 +106,7 @@ void paddle_pserver_client_release(paddle_pserver_client* client); * @return 1 if the trainer is selected to initialize parameter * servers, otherwise 0. */ -int paddle_begin_init_params(paddle_pserver_client* client); +int paddle_begin_init_params(paddle_pserver_client client); /** * @brief paddle_init_param initializes the parameter on parameter @@ -109,7 +120,7 @@ int paddle_begin_init_params(paddle_pserver_client* client); * @paddle_begin_init_param). Or simply exit the program and wait for * the cluster management system to restart the trainer. */ -int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); +int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); /** * @brief paddle_finish_init_params tells parameter servers client has @@ -120,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, con * @paddle_begin_init_param). Or simply exit the program and wait for * the cluster management system to restart the trainer. */ -int paddle_finish_init_params(paddle_pserver_client* client); +int paddle_finish_init_params(paddle_pserver_client client); /** * @brief paddle_send_grads sends gradients to parameter servers for @@ -131,7 +142,7 @@ int paddle_finish_init_params(paddle_pserver_client* client); * @param learning_rate the learning rate for the gradients. * @return 0 if successful, otherwise -1. */ -int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len); +int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len); /** * @brief paddle_get_params gets parameters from parameter servers. @@ -139,13 +150,15 @@ int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grad * paddle_get_params will block until parameters are initialized on * the parameter servers. * - * @param names the array of names of the parameters to get. - * @param dst the destination array of parameters to save to. + * @param dst the destination array of parameter pointers to save to. 
+ * The parameter pointer must be pre-populated with the required parameter name,
+ * and the content of the parameter must be pre-allocated to the size of the
+ * required parameter on the pserver.
 * @param len the length of the names array and the paddle_parameter
 * array.
 * @return 0 if successful, otherwise -1.
 */
-int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
+int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
 
 /**
  * @brief paddle_save_model indicates parameters to save the parameter
 *
@@ -154,5 +167,5 @@ int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_
 * @param path the path to save parameters.
 * @return 0 if successful, otherwise -1.
 */
-int paddle_save_model(paddle_pserver_client* client, const char* path);
+int paddle_save_model(paddle_pserver_client client, const char* path);
 ```
diff --git a/doc/design/cluster_train/remote_parameter_updater.md b/doc/design/cluster_train/remote_parameter_updater.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e8e5938455b869e0f3367794c41250340b37f77
--- /dev/null
+++ b/doc/design/cluster_train/remote_parameter_updater.md
@@ -0,0 +1,21 @@
+# Design Doc: Remote Parameter Updater for Cluster Train
+
+For an overview of distributed training, please refer to the [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that uses the parameter server cclient ([The Client Library of Parameter Server Design Doc](pserver_client.md)) to manage and update parameters.
+
+## Parameter Updater
+
+The Parameter Updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster training, we will only discuss the remote parameter updater here.
+
+### Remote Parameter Updater
+
+The Remote Parameter Updater manages parameters through remote parameter servers, using the client that communicates with the pservers ([The Client Library of Parameter Server Design Doc](pserver_client.md)).
+
+In the PaddlePaddle Python V2 API, the trainer is implemented in Python; it holds an instance of the parameter updater and calls its functions directly. In this design, we will also expose the API of RemoteParameterUpdater to Python with SWIG.
+
+#### Sparse Remote Parameter Updater
+
+Since we will only implement dense parameter management for now, the mechanism for sparse parameters will be discussed in the next stage.
+
+### Interface Design
+
+TBD
diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md
new file mode 100644
index 0000000000000000000000000000000000000000..b755185c81ad617b9c85c47de0f5f65d2201c658
--- /dev/null
+++ b/doc/design/cluster_train/save_model.md
@@ -0,0 +1,111 @@
+# Design Doc: Save Model
+
+## Overview
+
+The model is the output of the training process. There are two ways for the user to obtain a model:
+
+- Save model triggered by user code: user code asks PaddlePaddle to save a model.
+- Convert model from the checkpoint: the model is converted from the pservers' periodic checkpoints. In this way, the user can cancel a job at any time, and still have a relatively fresh model (we checkpoint around every 5 minutes).
+
+### Trainer Saving Model vs. Pservers Saving Model
+
+Both trainers and pservers have access to the model. So the model can be saved from a trainer or from the pservers. We need to decide where the model is saved from.
+
+#### Dense Update vs. Sparse Update
+
+There are two types of model update methods: dense update and sparse update (used when the model parameter is configured to be sparse).
+
+- Dense update
+
+  Every trainer has its own full copy of the model. Every model update updates the entire model.
+
+- Sparse update
+
+  The training input is sparse, and the trainer does not have the entire model. It only downloads the sub-model related to its input. When updating the model, only the sub-model related to the training input is updated.
+
+
+#### Pservers Saving Model
+
+The benefit of letting the pservers save the model is that they have the entire model all the time. However, since the pservers are on different nodes, saving the model requires a merging process to merge the model shards into a single model. This in turn requires the pservers to write the models to a distributed filesystem, making the checkpoint shards visible to the merge program.
+
+#### Trainer Saving Model
+
+The benefit of letting one trainer save the model is that it does not require a distributed filesystem, and it reuses the same save-model logic as local training - except when doing sparse update, where the trainer needs to download the entire model during the saving process.
+
+#### Conclusion
+
+Given that trainer saving model does not require a distributed filesystem, and is an intuitive extension of saving the model when training locally, we decide to let the trainer save the model when doing distributed training.
+
+
+### Convert Model from Checkpoint
+
+TODO
+
+
+## Timeline
+
+We will first implement saving the model from the trainer. Converting the latest snapshot to a model is a TODO for the future.
+
+
+## Trainer Save Model
+
+### Trainer Election
+
+One trainer will be elected as the one to save the model. When using etcd, the trainer ID is a randomly generated UUID; the trainer will contact the master server requesting to save the model, and find out whether it is elected. When the master server is not used, unique trainer IDs will be given by the administrator, and the trainer whose ID is "0" is elected to save the model.
+
+### Model Save Path
+
+Each trainer will be given the directory to save the model. The elected trainer will save the model to `given-directory/trainerID`. Since the trainer ID is unique, this prevents concurrent saves to the same file when multiple trainers are elected to save the model due to a split-brain problem.
+
+### What Happens When Model Is Saving
+
+It takes some time to save a model, so we need to define what happens while the model is being saved.
+
+When doing dense update, the trainer uses the local model. The pservers do not need to pause model updates.
+
+When doing sparse update, the trainer needs to download the entire model while saving. To get the most accurate model, the model update needs to be paused before the download starts and resumed after the download finishes. Otherwise, the trainer gets a model that is "polluted": some parts of the model are old, and some parts are new.
+
+It's unclear whether the "polluted" model will be inferior, due to the stochastic nature of deep learning, and pausing the model update would add more complexity to the system. Since supporting sparse update is a TODO item, we defer the evaluation of whether to pause the model update during model saving to the future.
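+The election and save-path rules above can be summarized in a small, hypothetical sketch. The helper names below are illustrative only and are not the actual trainer API:
+
+```python
+import os
+
+# Hypothetical sketch of the election and save-path rules described above.
+def maybe_save_model(trainer, trainer_id, save_dir, master_client=None):
+    if master_client is not None:
+        # etcd mode: the master server grants the save request to one trainer
+        elected = master_client.request_save_model(trainer_id)
+    else:
+        # no master server: the administrator-assigned trainer "0" saves
+        elected = (trainer_id == "0")
+    if elected:
+        # a per-trainer sub-directory prevents concurrent writes on split-brain
+        trainer.save_parameters(os.path.join(save_dir, trainer_id))
+```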
diff --git a/doc/design/cluster_train/src/paddle-etcd.graffle b/doc/design/cluster_train/src/paddle-etcd.graffle index 56681ae5bbe11849116d621b066a6317e003e4ca..f973dc9b9dbf72e9bc31e2d32822916cd281f8d9 100644 Binary files a/doc/design/cluster_train/src/paddle-etcd.graffle and b/doc/design/cluster_train/src/paddle-etcd.graffle differ diff --git a/doc/design/cluster_train/src/paddle-etcd.png b/doc/design/cluster_train/src/paddle-etcd.png index 4f9c9762b3a8c089dd5e9b2c07cb9dfc78296a21..57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31 100644 Binary files a/doc/design/cluster_train/src/paddle-etcd.png and b/doc/design/cluster_train/src/paddle-etcd.png differ diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle index 42384a3f059966e22e22f5fa4295cc9ead5cef83..43415ed8cf61a5acfa34f8e56b9577f338dbf254 100644 Binary files a/doc/design/cluster_train/src/trainer.graffle and b/doc/design/cluster_train/src/trainer.graffle differ diff --git a/doc/design/concurrent_programming.md b/doc/design/concurrent_programming.md new file mode 100644 index 0000000000000000000000000000000000000000..afc65e831d58ff427663806e56294292ccbef85b --- /dev/null +++ b/doc/design/concurrent_programming.md @@ -0,0 +1,163 @@ +# Design Doc: Concurrent Programming with Fluid + +With PaddlePaddle Fluid, users describe a program other than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model. + +Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting questions is **how does a `ProgramDesc` represents a concurrent program?** + +The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users just program a concurrent program that they do with any concurrent programming language, e.g., [Go](https://golang.org). + +## An Analogy + +The following table compares concepts in Fluid and Go + +| Go | Fluid | +|----|-------| +|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) | +| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) | +| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) | +| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) | + +## An Example Concurrent Program + +To review all above concepts in an example, let us take a simple program and writes its distributed version. + +Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors. + +```go +import "fluid" + +func paddlepaddle() { + X = fluid.read(...) + W = fluid.Tensor(...) + Y = fluid.mult(X, W) +} +``` + +Please be aware that the Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in above program and creates the following `ProgramDesc` message. 
+ +```protobuf +message ProgramDesc { + block[0] = Block { + vars = [X, W, Y], + ops = [ + read(output = X) + assign(input = ..., output = W) + mult(input = {X, W}, output = Y) + ], + } +} +``` + +Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message. + +The default `main` function is defined as follows: + +```go +func main() { + paddlepaddle() + fluid.run() +} +``` + +## The Concurrent Version + +By parallelizing the above program, we could support very big tensor X by splitting into small pieces {x_1, x_2, ...} and sent each piece to worker process/node for parallel multiplication. + +In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes. + +### The Master Program + +The master program could look like the following: + +```protobuf +message ProgramDesc { + block[0] = Block { + vars = [X, L, Y], + ops = [ + read(output = X) + kube_get_workers_addrs(output = L) + Y = tensor_array(len(L)) + parallel_for(input = X, output = Y, + attrs = {L, block_id(1)}) # referring to block 1 + ] + } + + block[1] = Block { + parent = 0, + vars = [x, y, index], + ops = [ + slice(input = [X, index], output = x) # index is initialized by parallel_for + send(input = x, attrs = L[index]) + recv(outputs = y, attrs = L[index]) + assign(input = y, output = Y[index]) + ] + } +} +``` + +The equivalent Fluid program (calling the Go binding) is: + +```go +func main() { //// block 0 + X = fluid.read(...) + L = fluid.k8s.get_worker_addrs() + Y = fluid.tensor_array(len(L)) + fluid.parallel_for(X, L, + func(index int) { //// block 1 + x = X[index] + fluid.send(L[index], x) + y = fluid.recv(L[index]) + Y[index] = y + }) +} +``` + +An explanation of the above program: + +- `fluid.k8s` is a package that provides access to Kubernetes API. +- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod). +- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, + + 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and + 2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread + 1. creates an Executor instance, and + 2. calls `Executor.Run(block)`, where `block` is block 1 as explained above. +1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0. + +### The Worker Program + +The worker program looks like + +```go +func main() { + W = Tensor(...) + x = fluid.listen_and_do( + fluid.k8s.self_addr(), + func(input Tensor) { + output = fluid.mult(input, W) + }) +} +``` + +where + +- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed, + 1. listens on the current pod's IP address, as returned by `fliud.k8s.self_addr()`, + 2. once a connection is established, + 1. 
creates a scope of two parameters, "input" and "output", + 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input", + 3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`. + +## Summarization + +From the above example, we see that: + +1. Fluid enables the imperative programming paradigm by: + 1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and + 2. call the `fluid.run` function that runs the program implicitly. +1. The program is described as a `ProgramDesc` protobuf message. +2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter. +3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message. +4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and execute intrinsics/operators' `Run` method sequentially as they appear in the `Block.ops` array. +5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request. +6. Threads are not necessarily OS thread; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example green threads is Go's [goroutines](https://tour.golang.org/concurrency/1). diff --git a/doc/design/csp.md b/doc/design/csp.md new file mode 100644 index 0000000000000000000000000000000000000000..ba9cacfdea7dcf7c6499b562dfc58400d082f2c8 --- /dev/null +++ b/doc/design/csp.md @@ -0,0 +1,96 @@ +# Design Doc: CSP in PaddlePaddle Fluid + +## Motivation + +Concurrent programming is important for deep learning. Few example applications are: + +1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing. +2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server. + +Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language. + +## Concurrent Programming Models + +There were many concurrent programming models, implemented in various forms: + +| concurrent programming model | implementation | +|-----|-----| +| mutex | types and functions in standard libraries | +| semaphore | types and functions in standard libraries | +| communicating sequential processes (CSP) | Go programming language | +| actor model | Erlang programming language | +| message passing | MPI | +| bulk synchronous parallel (BSP) | Pregel distributed programming framework | + +Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid. + +### CSP v.s. Actor Model + +A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. 
Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv. + +## CSP in Fluid + +Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following: + +1. a new data type: *channel* and operators *send* and *recv*, +1. *goroutine* or thread, and +1. a new control-flow: select. + +We also need Python wrappers for the above components. + +The type *channel* is conceptually the blocking queue. In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv. + +The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll. + +It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax. + +### Type Channel + +Fluid supports many data types: + +1. Tensor, +1. Row-sparse Tensor +1. LoD Tensor, +1. Tensor array, etc + +Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new type channel, we need to add a new type enum. + +To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor. + +## Syntax Design + +### Create Channel + +In Go, we create a channel by specifying the element type and buffer size: + +```go +ch := make(chan int) // a channel without buffer +ch1 := make(chan int, 100) // a channel that can buffer 100 ints. +``` + +In Fluid, we should be able to do the same: + +```python +ch = fluid.make_chan(dtype=INT) +ch1 = fluid.make_chan(dtype=INT, 100) +``` + +In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16: + +```python +ch = fluid.make_chan(dtype=Tensor, etype=float16) +``` + +or Tensors of Tensors of float16 etc. + +The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor...> >`. + +### Send and Recv + +### Select + +## Example Programs + +### 1. RPC between Trainers and Parameter Servers + +### 2. 
Concurrent Minibatch Loading diff --git a/doc/design/dcgan.png b/doc/design/dcgan.png new file mode 100644 index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28 Binary files /dev/null and b/doc/design/dcgan.png differ diff --git a/doc/design/dist_refactor/distributed_architecture.md b/doc/design/dist_refactor/distributed_architecture.md new file mode 100644 index 0000000000000000000000000000000000000000..9368c5780dc922953f38bf0f86d9f797a4a8a6fe --- /dev/null +++ b/doc/design/dist_refactor/distributed_architecture.md @@ -0,0 +1,197 @@ +# Design Doc: Distributed Training Architecture + +## Abstract + +PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations: + +1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc. + +2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers. + +3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries. + +This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations. + +## Analysis + +The assumption is that the user writes the trainer program in either Python or C++. + +### Limitation 1 + +There are two basic functionalities in the trainer program: + +1. The training logic such as loading / saving the model and printing out the logs. +2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the + optimizer. + +When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the +training logic as well as the neural network computation logic, is replicated. + +The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not** +replicate the training logic, the limitation mentioned above can be avoided. + +### Limitation 2 + +Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the +inter-model-shard communication between nodes. + +PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the +computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup. + +Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. 
The compiler optimizes the IR as follows: + + + +PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component: + + + +The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation. + +### Limitation 3 + +The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly. + +This could be fixed by making the parameter server also run an IR, which can be different to the trainer side +For a detailed explanation, refer to this document - +[Design Doc: Parameter Server](./parameter_server.md) + +## Distributed Training Architecture + +The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so: + + + +The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*. + +### Python API + +Python API is the Python library that user's Python code invokes, to read the data, build the neural network topology, and start training, etc. + +```Python +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +... +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimizer.minimize(avg_cost) + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) + +for pass_id in range(10): + for data in train_reader(): + loss, acc = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost]) +``` + +The code above is a typical local training program, the "Training Program" is built using helper functions such as +`fluid.layer.fc`. The training is done by calling `Executor.run` +iteratively. + +For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type. + +[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use +`Executor` to run the program locally. For any kind of distributed training, you can use +`RemoteExecutor` to specify desired distributed training method with some optional arguments. + +### Distributed Transpiler + +The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then +the Remote Executor dispatches the new IRs to Remote Executors across the cluster. +Below are the steps that are followed : + +1. User only need to change `Executor` to `RemoteExecutor` to change local program to distributed program. +1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" user's program to several IRs representing a + distributed training program: + 1. Parse configurations from `RemoteExecutor`. + 1. Determine the type of distributed program, can be DataParallelism, ModelParallelism or Streaming. + 1. Partition the `ProgramDesc` according to type and add `send` / `recv` OP pair on the boundaries. 
Take the DataParallelism type for example: it removes the optimization operators and adds a `send` OP to the "trainer" role, then adds the optimization operators to the parameter server role within the `recv` OP.
+1. Dispatch the partitioned graphs to the different `RemoteExecutor`s in the cluster.
+1. The `RemoteExecutor` on each node runs the received `ProgramDesc` until the end.
+
+
+### RemoteExecutor
+
+As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for execution.
+You can also use the `fetch_list` parameter to interactively fetch variables back locally for log printing.
+
+The Python `RemoteExecutor` is derived from the `Executor` class.
+
+```python
+exe = RemoteExecutor(
+    feed=feeder.feed(data),
+    fetch_list=[avg_cost],
+    job_desc=JobDesc(
+        jobname,
+        num_trainer,
+        num_pserver,
+        cpu_per_trainer,
+        gpu_per_trainer,
+        mem_per_trainer,
+        cpu_per_pserver,
+        mem_per_pserver
+    ))
+for data in train_reader():
+    loss, acc = exe.run(trainer_prog,
+                        feed=feeder.feed(data),
+                        fetch_list=[avg_cost])
+```
+
+The `JobDesc` object describes the distributed job resource specification to run in the cluster environment.
+
+
+`RemoteExecutor.run` sends the `ProgramDesc` and
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
+to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible for starting the final Kubernetes Jobs that run the different roles of the `ProgramDesc` from the `ConfigMap`.
+
+
+### Placement Algorithm
+
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible. A minimal sketch of this placement rule is given at the end of this section.
+
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
+
+
+### Local Training Architecture
+
+The local training architecture will be the same as the distributed training architecture; the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
+
+
+### Training Data
+
+In PaddlePaddle v0.10.0, training data is typically read from Python with a [data reader](../reader/README.md). This approach is no longer efficient when training in a distributed fashion, since the Python process no longer runs on the same node as the trainer processes; the Python reader would need to read from the distributed filesystem (assuming it has access) and send the data to the trainers, doubling the network traffic.
+
+When doing distributed training, the user can still use the Python data reader: the training data is sent with `Executor.run`. However, this should be used for debugging purposes only. Users are encouraged to use the read-data OPs.
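+Below is the minimal, hypothetical sketch of the simple placement rule referenced in the Placement Algorithm section above. The helper and op names here are illustrative assumptions, not the actual transpiler implementation:
+
+```python
+# Hypothetical sketch: assign each op of a ProgramDesc block to a role,
+# following the simple "trainer-parameter server" placement described above.
+OPTIMIZE_OR_INIT_OPS = {"sgd", "adam", "fill_constant", "uniform_random"}
+
+def place_op(op, param_names):
+    # ops that write parameter variables (optimizers and initializers) run on
+    # the pserver role; everything else stays on the trainer role.
+    writes_param = any(name in param_names for name in op.output_arg_names)
+    if op.type in OPTIMIZE_OR_INIT_OPS and writes_param:
+        return "pserver"
+    return "trainer"
+```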
+ + +## References: + +[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) + +[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf) diff --git a/doc/design/dist_refactor/multi_cpu.md b/doc/design/dist_refactor/multi_cpu.md new file mode 100644 index 0000000000000000000000000000000000000000..a8d8ee0422acc84835170a44eb83f9b5f0c6bb40 --- /dev/null +++ b/doc/design/dist_refactor/multi_cpu.md @@ -0,0 +1,43 @@ +# Design Doc: Execute the Program with Multi CPU + +## Abstract + +This Design Doc propose an approach to make the user-defined Op graph +running with multi-CPU, we will use an auto transpiler to convert the user-defined +Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph. + +## Transpiler + + + +After converted: + + + +## Implement + +- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph + which would be executed with multi-threads. +- `BlockingCounter` will `Init/Decrement` an atomic counter, and Blocking `Wait` + for the atomic counter become `0`: + ```cpp + BlockingCounter bc(thread_count); + for (int i = 0; i < thread_count; ++i) { + thread_pool->Start([&bc] {bc.DecrementCount(); }) + } + bc.Wait(); + ``` +- `ParallelDo` Operator + - Initialize a thread pool which is a Singleton. + - Use a block id as the input, and create run the specify Block on independent scope + with multi-threads. + - Initialize a `BlockingCounter` instance and wait until all threads are done. +- `Split` Operator will split the Input Tensor into a TensorArray. +- `Merge` merge all the gradients which calculated in different threads + with `mean/sum/max/min...` method, and then run the Optimizer Op to optimize `W`. + +## TODO + +- Improve the optimizer stage with multi-threads, since we could + assign the parameters to the different threads and execute + optimizer with multi-threads. diff --git a/doc/design/dist_refactor/parameter_server.md b/doc/design/dist_refactor/parameter_server.md new file mode 100644 index 0000000000000000000000000000000000000000..805dd13048d41b995d2a01cda52b2ea33e4bbe1d --- /dev/null +++ b/doc/design/dist_refactor/parameter_server.md @@ -0,0 +1,96 @@ +# Design Doc: Parameter Server + +## Abstract + +We propose an approach to implement the parameter server. In this +approach, there is no fundamental difference between the trainer and +the parameter server: they both run subgraphs, but subgraphs of +different purposes. + +## Background + +The previous implementations of the parameter server do not run a +fluid sub-program. Parameter initialization, optimizer computation, network +communication and checkpointing are implemented twice on both the +trainer as well as the parameter server. + +It would be great if we can write code once and use them on both: the +trainer and the parameter server, since this reduces code duplication and +improves extensibility. Given that after the current refactoring, we are +representing everything as a computation graph on the +trainer. Representing everything as a computation graph on the parameter +server becomes a natural extension. + +## Design + +### Distributed Transpiler + +The *Distributed Transpiler* converts the user-defined fluid program +into sub-programs to be scheduled on different nodes with the following +steps: + +1. 
OP placement: the OPs will be placed on different nodes according + to a heuristic that minimizes the estimated total computation + time. Currently we will use a simple heuristic that puts parameter + variable on parameter server workers and everything else on trainer + workers. +1. Add communication OPs to enable the communication between nodes. + +We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*. + +Below is an example of converting the user defined graph to the +subgraphs for the trainer and the parameter server: + + + +After converting: + + + +1. The parameter variable W and its optimizer program are placed on the parameter server. +1. Operators are added to the program. + - *Send* sends data to the connected *Recv* operator. The + scheduler on the receive node will only schedule *Recv* operator + to run when the *Send* operator has ran (the *Send* OP will mark + the *Recv* OP runnable automatically). + - *Enqueue* enqueues the input variable, it can block until space + become available in the queue. + - *Dequeue* outputs configurable numbers of tensors from the + queue. It will block until the queue has the required number of + tensors. + + +### Benefits + +- Model parallelism becomes easier to implement: it is an extension to + the trainer - parameter server approach. We can have several "Transpilers" + to achieve different goals. +- User-defined optimizer is easier to add - user can now express it as + a sub-program. +- No more duplication logic inside the trainer and the parameter + server mentioned in the background section. + +### Challenges + +- It is important to balance the parameter shards on multiple + parameter servers. If a single parameter is very big (for example: some + word-embedding, fully connected, softmax layer), we need to + automatically partition the single parameter onto different + parameter servers when possible (only element-wise optimizer depends + on the parameter variable). +- In the "Async SGD" figure, the "W" variable on the parameter server + could be read and written concurrently. See + [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more + details about concurrent program in Fluid. + +### Discussion + +- Can the Enqueue OP be implemented under our current tensor design + (put the input tensor into the queue tensor)? +- *Dequeue* OP will have variable numbers of output (depending on the + `min_count` attribute), does our current design support it? 
(similar + question for the *Add* OP) + + +### References: +[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) diff --git a/doc/design/dist_refactor/src/compiler.graffle b/doc/design/dist_refactor/src/compiler.graffle new file mode 100644 index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de Binary files /dev/null and b/doc/design/dist_refactor/src/compiler.graffle differ diff --git a/doc/design/dist_refactor/src/compiler.png b/doc/design/dist_refactor/src/compiler.png new file mode 100644 index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe Binary files /dev/null and b/doc/design/dist_refactor/src/compiler.png differ diff --git a/doc/design/dist_refactor/src/dist-graph.graffle b/doc/design/dist_refactor/src/dist-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b Binary files /dev/null and b/doc/design/dist_refactor/src/dist-graph.graffle differ diff --git a/doc/design/dist_refactor/src/dist-graph.png b/doc/design/dist_refactor/src/dist-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7 Binary files /dev/null and b/doc/design/dist_refactor/src/dist-graph.png differ diff --git a/doc/design/dist_refactor/src/distributed_architecture.graffle b/doc/design/dist_refactor/src/distributed_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907 Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.graffle differ diff --git a/doc/design/dist_refactor/src/distributed_architecture.png b/doc/design/dist_refactor/src/distributed_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356 Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.png differ diff --git a/doc/design/dist_refactor/src/local-graph.graffle b/doc/design/dist_refactor/src/local-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5 Binary files /dev/null and b/doc/design/dist_refactor/src/local-graph.graffle differ diff --git a/doc/design/dist_refactor/src/local-graph.png b/doc/design/dist_refactor/src/local-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7 Binary files /dev/null and b/doc/design/dist_refactor/src/local-graph.png differ diff --git a/doc/design/dist_refactor/src/local_architecture.graffle b/doc/design/dist_refactor/src/local_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877 Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.graffle differ diff --git a/doc/design/dist_refactor/src/local_architecture.png b/doc/design/dist_refactor/src/local_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122 Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.png differ diff --git a/doc/design/dist_refactor/src/multi-threads.graffle b/doc/design/dist_refactor/src/multi-threads.graffle new file mode 100644 index 
0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6 Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads.graffle differ diff --git a/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9 Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png differ diff --git a/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png differ diff --git a/doc/design/dist_refactor/src/paddle-compile.graffle b/doc/design/dist_refactor/src/paddle-compile.graffle new file mode 100644 index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6 Binary files /dev/null and b/doc/design/dist_refactor/src/paddle-compile.graffle differ diff --git a/doc/design/dist_refactor/src/paddle-compile.png b/doc/design/dist_refactor/src/paddle-compile.png new file mode 100644 index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf Binary files /dev/null and b/doc/design/dist_refactor/src/paddle-compile.png differ diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/dist_refactor/src/remote_executor.graffle new file mode 100644 index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.graffle differ diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/dist_refactor/src/remote_executor.png new file mode 100644 index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79 Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.png differ diff --git a/doc/design/error_clip.md b/doc/design/error_clip.md new file mode 100644 index 0000000000000000000000000000000000000000..58aa73b8cd38d01e2426278a3479714e4fb6a3b0 --- /dev/null +++ b/doc/design/error_clip.md @@ -0,0 +1,92 @@ +# Error Clip + +## Overview + +Error clip is widely used in model training to prevent gradient exploding. It takes some specific rules to adjust variables' gradients and prevent them from being too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and be shrunk if necessary. +## Usage + +Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor: + +```python +var = framework.Variable(..., error_clip=myErrorClip, ...) +``` + +The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should take an object of `BaseErrorClipAttr`'s derived class. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is: + +```python +ErrorClipByValue(max, min=None) +``` + +`max` and `min` represent the maximal and minimal clip threshold respectively. In backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. 
When `min` is None, the minimal threshold is automatically set to `-max`.
+
+So we can enable error clip with the threshold `[-5.0, 5.0]` for variable `var` by:
+
+```python
+var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+```
+
+## Implementation
+
+The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.
+
+```python
+class BaseErrorClipAttr(object):
+    def append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def append_clip_op(self, block, grad_name):
+        clip_op_desc = block.desc.append_op()
+        clip_op_desc.set_type("clip")
+        clip_op_desc.set_input("X", [grad_name])
+        clip_op_desc.set_output("Out", [grad_name])
+        clip_op_desc.set_attr("min", self.min)
+        clip_op_desc.set_attr("max", self.max)
+```
+
+`BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.
+
+This function is used to create a `clip_op` and append it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is defined as virtual in the base class. All derived classes must implement their own versions of this function.
+
+These `clip_op`s should be inserted after the `grad_op`s whose output gradients need to be clipped. That is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added.
+
+```python
+for op_desc in grad_op_descs:
+    new_op_desc = target_block.desc.append_op()
+    new_op_desc.copy_from(op_desc)
+    callback(block=target_block, context=grad_to_var)
+```
+
+Here we employ a callback function to complete this kind of job. In the `_append_backward_ops_` function, each time a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of appending `clip_op`s can be implemented inside the callback function.
+
+The callback function for `clip_op` appending is defined in *clip.py*:
+
+```python
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+                         op_desc.output_arg_names()):
+        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if not (error_clip is None or isinstance(error_clip,
+                                                 BaseErrorClipAttr)):
+            raise TypeError(
+                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+            )
+        if error_clip is not None:
+            error_clip.append_clip_op(block, grad_n)
+```
+
+This function takes a `block` and a `context` (which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op`, and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an `error_clip` attribute, `error_clip_callback` will call that `error_clip`'s `append_clip_op` function to append the required `clip_op` to the `block`.
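+To illustrate this extension point, here is a hypothetical derived class that is not part of the current code; a new rule only has to emit the desired op in `append_clip_op` (the example reuses the existing `scale` operator purely for illustration):
+
+```python
+# Hypothetical example of a new error clip rule: shrink the gradient by a
+# constant factor instead of clipping it by value.
+class ErrorClipByScale(BaseErrorClipAttr):
+    def __init__(self, scale):
+        self.scale = float(scale)
+
+    def append_clip_op(self, block, grad_name):
+        op_desc = block.desc.append_op()
+        op_desc.set_type("scale")
+        op_desc.set_input("X", [grad_name])
+        op_desc.set_output("Out", [grad_name])
+        op_desc.set_attr("scale", self.scale)
+```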
diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md
new file mode 100644
index 0000000000000000000000000000000000000000..11cc129d56905a9ee666da92fbe6f8559c6d325a
--- /dev/null
+++ b/doc/design/evaluator.md
@@ -0,0 +1,58 @@
+## Evaluator Design
+
+### Problem Statement
+
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator-based framework design, the data passes through the network pipeline batch by batch. As a result, inside an operator we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics over every N batches or passes, as the user specifies.
+
+### Evaluator Design
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the concerned metrics for every mini-batch. A single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for one minibatch of data each time it is run.
+
+
+3. Merge the mini-batch statistics to form the evaluation result over multiple mini-batches. When it comes to distributed or multi-GPU training, aggregate the values from the different devices.
+
+### Implementation
+This design is expressed in the Python API below.
+Each metric operator calculates the metric statistics for one batch and returns the batch-aware states. The Python side is responsible for accumulating the states over each pass.
+
+
+```python
+class Evaluator(object):
+    """
+    Evaluator Base class.
+    """
+    def __init__(self, name, **kwargs):
+        """
+        Different evaluators may have different metric states. E.g., Accuracy needs two variables, the total and correct sample counts.
+        Auc needs four variables: `true_positives`,
+        `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create the variables it needs and append them to main_program.
+
+        The initialization of Evaluator is responsible for:
+        creating the metric states and appending them to the main_program.
+        """
+        pass
+
+    def _update_ops(self, input, label, **kwargs):
+        """
+        Add the mini-batch metric calculation operators to the main_program.
+        Add increment operators to accumulate the metric states.
+        """
+
+
+    def reset(self, executor, reset_program=None):
+        """
+        Reset the metric states at the beginning of each pass or user-specified batch interval.
+        Execute the reset_program to reset the states.
+        """
+
+
+    def eval(self, executor, eval_program=None):
+        """
+        Merge the mini-batch statistics to form the evaluation result over multiple mini-batches.
+        Execute the eval_program and return the result.
+        """
+        return eval_result
+```
diff --git a/doc/design/executor.md b/doc/design/executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d4b371cc56db82ce5747da6db07f05aa7f7e6c1
--- /dev/null
+++ b/doc/design/executor.md
@@ -0,0 +1,29 @@
+# Executor Design Doc
+
+## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process.
When the user-written Python program is executed, it will first create a protobuf message +[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). + +The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code. + +## Overview + +An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs. + +## Executor + +The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one. +It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process. + +### The interface +```c++ + Executor(places); +``` +A executor does not own any computing resources, a user can only construct an executor using the specified places. + +### Running an Executor + +``` + void Run(ProgramDesc, Scope, block_id, create_local_scope); +``` +An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished. diff --git a/doc/design/float16.md b/doc/design/float16.md new file mode 100644 index 0000000000000000000000000000000000000000..1ea95ed6b5d6792171569b6ff76d09be92fcb13e --- /dev/null +++ b/doc/design/float16.md @@ -0,0 +1,105 @@ +# Design Doc: float16 + +## Why float16 +Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. + +When high precision computation is not required, using float16 data type could potentially + +- reduce storage space, memory bandwidth, and power usages; +- increase the chance of data fitting into a smaller cache of lower latency; +- provide arithmetic speed up if supported by hardware. + +## Survey of current float16 support +A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info. + +The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernel. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. 
+ +### Compiler +- nvcc supports `__half` data type after CUDA 7.5. +- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4. +- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9. + +### Hardware +- `__half` is supported on GPU with compute capability >= 5.3. +- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above. +- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018). + +### Libraries +- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. +- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). + +### CUDA version issue +There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. +CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows: +``` +typedef struct __align__(2) { + unsigned short x; +} __half; + +typedef __half half; +``` +This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types: +``` +__global__ void Add() { + half a, b, c; + c = __hadd(a, b); // correct + c = a + b; // compiler error: no operator "+" matches these operands +} +``` +CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp). + +Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows: +``` +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; +public: + // constructors and conversion operators from/to + // __half_raw and other built-in data types +} + +typedef __half half; + +__device__ __forceinline__ +__half operator+(const __half &lh, const __half &rh) { + return __hadd(lh, rh); +} + +// Other overloaded operators +``` +This new design makes `c = a + b` work correctly for CUDA half data type. + +## Implementation +The float16 class holds a 16-bit `uint16_t` data internally. +``` +struct float16 { + uint16_t x; +}; +``` + +float16 supports the following features: + - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. 
+ - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen. + - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. + - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. + +To support the above features, two fundamental conversion functions are provided: +``` +float16 float_to_half_rn(float f); // convert to half precision in round-to-nearest-even mode +float half_to_float(float16 h); +``` +which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion. + +## To do +After float16 class is available, some of the future items are below: + +- Update pybind/tensor_py.h to bind c++ float16 with numpy float16. + +- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16. + +- Create a type-casting operator that can convert the data type in tensor between float16 and other types. diff --git a/doc/design/fluid-compiler.graffle b/doc/design/fluid-compiler.graffle new file mode 100644 index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d Binary files /dev/null and b/doc/design/fluid-compiler.graffle differ diff --git a/doc/design/fluid-compiler.png b/doc/design/fluid-compiler.png new file mode 100644 index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb Binary files /dev/null and b/doc/design/fluid-compiler.png differ diff --git a/doc/design/fluid.md b/doc/design/fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..2acc168007d25a083f588b48f84e12e29baf4f47 --- /dev/null +++ b/doc/design/fluid.md @@ -0,0 +1,114 @@ +# Design Doc: PaddlePaddle Fluid + +## Why Fluid + +When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework? + +Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model. In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning. + +## The Evolution of Deep Learning Systems + +Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented. 
+ +| Existed since | model as sequence of layers | model as graph of operators | No model | +|--|--|--|--| +| 2013 | Caffe, Theano, Torch, PaddlePaddle | | | +| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | | +| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid | + +From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these. + +## Deep Learning Programming Paradigms + +With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following: + +```python +x = layer.data("image") +l = layer.data("label") +f = layer.fc(x, W) +s = layer.softmax(f) +c = layer.mse(l, s) + +for i in xrange(1000): # train for 1000 iterations + m = read_minibatch() + forward({input=x, data=m}, minimize=c) + backward(...) + +print W # print the trained model parameters. +``` + +The above program includes two parts: + +1. The first part describes the model, and +2. The second part describes the training process (or inference process) for the model. + +This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt. + +This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as following: + +```python +W = tensor(...) + +for i in xrange(1000): # train for 1000 iterations + m = read_minibatch() + x = m["image"] + l = m["label"] + f = layer.fc(x, W) + s = layer.softmax(f) + c = layer.mse(l, s) + backward() + +print W # print the trained model parameters. +``` + +We can see that the main difference is the moving the model configuration part (the first step) into the training loop. This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block. This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop. + +## Describe Arbitrary Models for the Future + +Describing the process instead of the model also brings Fluid, the flexibility to define different non-standard models that haven't been invented yet. + +As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following: + +```python +for i in xrange(1000): + m = read_minibatch() + x = m["sentence"] + for t in xrange x.len(): + h[t] = the_step(x[t]) +``` + +With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following: + +```python +train_loop = layers.While(cond) +with train_loop.block(): + m = read_minibatch() + x = m["sentence"] + rnn = layers.While(...) 
+ with rnn.block(): + h[t] = the_step(input[t]) +``` + +An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44). + +From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. + +We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid. + +## Turing Completeness + +In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loop, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is noteworthy to notice that there is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference being that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. This hasn't been researched in depth if this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete. Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case but this needs to be looked into in-depth. + +## The Execution of a Fluid Program + +There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). + +There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program. + +Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md). + +## Backward Compatibility of Fluid + +Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators. + +For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format. 
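+
+A minimal sketch of such a converter, assuming a hypothetical one-to-one mapping from Fluid operator types to ONNX operator types and illustrative field names on the `ProgramDesc` message (the real proto accessors may differ), could look like this:
+
+```python
+from onnx import helper
+
+# Hypothetical mapping from Fluid op types to ONNX op types; a real
+# exporter would need an entry for every operator it supports.
+FLUID_TO_ONNX = {"mul": "MatMul", "elementwise_add": "Add", "relu": "Relu"}
+
+
+def program_desc_to_onnx(program_desc):
+    """Convert the first block of a ProgramDesc into an ONNX model (sketch)."""
+    block = program_desc.blocks[0]
+    nodes = []
+    for op in block.ops:
+        # Each Fluid operator becomes one ONNX node connecting the same
+        # input and output variable names.
+        nodes.append(helper.make_node(
+            FLUID_TO_ONNX[op.type], list(op.inputs), list(op.outputs)))
+    graph = helper.make_graph(nodes, "fluid_program", [], [])
+    return helper.make_model(graph)
+```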
diff --git a/doc/design/fluid_compiler.md b/doc/design/fluid_compiler.md new file mode 100644 index 0000000000000000000000000000000000000000..2a6beafc52e815fa067b273bb5887ddcf6ab15ae --- /dev/null +++ b/doc/design/fluid_compiler.md @@ -0,0 +1,110 @@ +# PaddlePaddle Fluid: Towards a Compiled Programming Language + +As described in [fluid.md](fluid.md), when a Fluid application program +runs, it generates a `ProgramDesc` protobuf message as an intermediate +representation of itself. The C++ class `Executor` can run this +protobuf message as an interpreter. This article describes the Fluid +compiler. + +![](fluid-compiler.png) + +## ProgramDesc + +Before we go deeper into the idea of compiled language, let us take a +look at a simple example Fluid application. + +```python +import "fluid" + +func paddlepaddle() { + X = fluid.read(...) + W = fluid.Tensor(...) + Y = fluid.mult(X, W) +} +``` + +This program consists of a [block](block.md) of three operators -- +`read`, `assign`, and `mult`. Its `ProgramDesc` message looks like +the following + +```protobuf +message ProgramDesc { + block[0] = Block { + vars = [X, W, Y], + ops = [ + read(output = X) + assign(input = ..., output = W) + mult(input = {X, W}, output = Y) + ], + } +} +``` + +## Transpilers + +We can write a transpiler program that takes a `ProgramDesc`, e.g., +the above one, and outputs another `ProgramDesc`. Let us take some +examples: + +1. *Memory optimization transpiler*: We can write a transpiler that + inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so + to free memory early, before the end of an iteration, so to keep a + small memory footprint. + +1. *Distributed training transpiler*: We can write a transpiler that + converts a`ProgramDesc` into its distributed version of two + `ProgramDesc`s -- one for running by the trainer processes and the + other for the parameter server. + +In the rest of this article, we talk about a special kind of +transpiler, *Native code generator*, which takes a `ProgramDesc` and +generates a `.cu` (or `.cc`) file, which could be built by C++ +compilers (gcc, nvcc, icc) into binaries. + +## Native Code Generator + +For the above example, the native code generator transpiler, say, the +CUDA code generator, should generate a `main` function: + +```c++ +void main() { + auto X = fluid_cuda_read(...); + auto W = fluid_cuda_create_tensor(...); + auto Y = fluid_cuda_mult(X, W); +} +``` + +and the definitions of functions `fluid_cuda_read`, +`fluid_cuda_create_tensor`, and `fluid_cuda_mult`. Please be aware +that each function could just define a C++ instance of an operator and +run it. For example + +```c++ +paddle::Tensor fluid_cuda_read(...) { + paddle::Tensor t; + paddle::operator::Read r(&t, ...); + r.Run(); + return t; +} +``` + +For computational operators that have multiple *kernels*, each for a +specific hardware platform, for example, the `mult` operator, the +generated code should call its CUDA kernel: + +```c++ +paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, + const paddle::Tensor& b) { + paddle::Tensor t; + paddle::operator::Mult m(a, b, ...); + Mult.Run(cuda_context); +} +``` + +where `cuda_context` could be a global variable of type +`paddle::CUDADeviceContext`. + +## Multi-Block Code Generation + +Most Fluid application programs may have more than one blocks. To +execute them, we need to trace [scopes](scope.md). 
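+
+As a rough sketch of what multi-block code generation could look like, the generator below emits one C++ function per block and threads a scope argument through, so that a sub-block can resolve variables defined by its parent block. All names used here (`blocks`, `ops`, `attrs`, `fluid_cuda_<op>`, `Scope`) are illustrative assumptions, not the actual ProgramDesc or runtime API:
+
+```python
+def generate_block_function(program_desc, block_idx):
+    """Emit the C++ source of one function for one block (illustrative only)."""
+    block = program_desc.blocks[block_idx]
+    lines = [
+        "void block_%d(Scope* parent) {" % block_idx,
+        "  Scope* scope = parent->NewScope();  // local variables of this block",
+    ]
+    for op in block.ops:
+        if op.type == "while":
+            # A control-flow operator refers to a sub-block; the generated
+            # call passes the current scope as the sub-block's parent.
+            sub_idx = op.attrs["sub_block"]
+            lines.append("  while (/*cond*/) { block_%d(scope); }" % sub_idx)
+        else:
+            lines.append("  fluid_cuda_%s(scope, ...);" % op.type)
+    lines.append("}")
+    return "\n".join(lines)
+```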
diff --git a/doc/design/functions_operators_layers.md b/doc/design/functions_operators_layers.md new file mode 100644 index 0000000000000000000000000000000000000000..984b59f4c6971dfb6f46dfe342f2751f392c0e88 --- /dev/null +++ b/doc/design/functions_operators_layers.md @@ -0,0 +1,100 @@ +# Design Doc: Functions, Operators, and Layers + +In a DL system, we can compose one or more fine grained operators into a coarse grained one. For example, the FC layer can be composed of a multiplication operator and an add operator. + +Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers. But we need a well-defined separation. + +In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions: + +```c++ +template T add(T x, T y) { return x + y; } +template T mul(T x, T y) { return x * y; } +``` + +Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation + +```c++ +#define MAKE_FUNCTION_OPERATOR(mul); +``` + +generates + +```c++ +template class mulOp : public OperatorBase {...}; +REGISTER_OP(mulOp, "mul"); +``` + +so that in Python we can create operator mul by: + +```python +X1 = Var() +X2 = Var() +Y = Var() +paddle.cpp.create_operator("mul", input=[X1, X2], output=Y) +``` + +Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`: + +```c++ +template +class FCOp : public OperatorBase { + public: + void Run(...) { + add(mul(Input("X"), Input("W")), Input("b"); + } +}; +REGISTER_OP(FCOp, "fc"); +``` + +We need to support such composition in Python as well. To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher level operator API should be compatible with the layer API. + +Let's explain using an example. Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`: + +```python +def operator.mul(X1, X2): + O = Var() + paddle.cpp.create_operator("mul", input={X1, Y1}, output=O) + return O + +def operator.add(X1, X2): + O = Var() + paddle.cpp.create_operator("add", input={X1, X2}, output=O) + return O +``` + +Above code snippets are automatically generated. Given them, users can define + +```python +def layer.fc(X): + W = Var() + b = Var() + return operator.add(operator.mul(X, W), b) +``` + +If we don't have `operator.mul` and `operator.add`, the definiton of `layer.fc` would be complicated: + +```python +def layer.fc(X): + W = Var() + b = Var() + O1 = Var() + paddle.cpp.create_operator("mul", input=[X, W], output=O1) + O2 = Var() + paddle.cpp.create_operator("add", input=[O1, b], output=O2) + return O2 +``` + +We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. 
So we have the following concepts in above illustrative example: + + +| C++ functions/functors | mul | add | | | +|------------------------|--------------|--------------|-------------|----------| +| C++ operator class | mulOp | addOp | FCOp | | +| Python binding | operator.mul | operator.add | operator.fc | | +| Python function | | | | layer.fc | + + +This is how we differentiate layer and operators in PaddlePaddle: + +- those defined in C++ and have a lightweighted Python wrapper in module `operators` are operators; whereas +- those who don't have C++ implementations but a Python implementation that compose C++ operators are known as layers. diff --git a/doc/design/gan_api.md b/doc/design/gan_api.md new file mode 100644 index 0000000000000000000000000000000000000000..fb41df8615f73d9fd4c32995eab265833eac1a55 --- /dev/null +++ b/doc/design/gan_api.md @@ -0,0 +1,253 @@ +# Design for GAN + +GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. + +It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth. + +In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation. + +

+
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of discriminator training. The BP pass of the green (red) arrows should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss, marked in red and green, are the two targets we would like to run.
+
+ +The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563. + +

+
+Figure 2. Photo borrowed from the original DC-GAN paper.
+
+ +## The Conditional-GAN might be a class. +This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure: + +- DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API: + +- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well. + +- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen. +Returns a generated image. + +- discriminator(image): +Given an image, decide if it is from a real source or a fake one. +Returns a 0/1 binary label. + +- build_model(self): +build the whole GAN model, define training loss for both generator and discrimator. + +## Discussion on Engine Functions required to build GAN +- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can'be be trained correctly) +- Different optimizers responsible for optimizing different loss. + +To be more detailed, we introduce our design of DCGAN as following: + +### Class member Function: Initializer +- Set up hyper-parameters, including condtional dimension, noise dimension, batch size and so forth. +- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G. +```python +class DCGAN(object): + def __init__(self, y_dim=None): + + # hyper parameters + self.y_dim = y_dim # conditional gan or not + self.batch_size = 100 + self.z_dim = z_dim # input noise dimension + + # define parameters of discriminators + self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer()) + self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.D_W2 = pd.Varialble(np.random.rand(128, 1)) + self.D_b2 = pd.Variable(np.zeros(128)) + self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2] + + # define parameters of generators + self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.G_W2 = pd.Varialble(np.random.rand(128, 1)) + self.G_b2 = pd.Variable(np.zeros(128)) + self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2] +``` + +### Class member Function: Generator +- Given a noisy input z, returns a fake image. +- Concatenation, batch-norm, FC operations required; +- Deconv layer required, which is missing now... 
+```python +class DCGAN(object): + def generator(self, z, y = None): + # input z: the random noise + # input y: input data label (optional) + # output G_im: generated fake images + + if not self.y_dim: + z = pd.layer.concat(1, [z, y]) + + G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0) + G_h0_bn = pd.layer.batch_norm(G_h0) + G_h0_relu = pd.layer.relu(G_h0_bn) + + G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1) + G_h1_bn = pd.layer.batch_norm(G_h1) + G_h1_relu = pd.layer.relu(G_h1_bn) + + G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)) + G_im = pd.layer.tanh(G_im) + return G_im +``` + +### Class member function: Discriminator +- Given a noisy input z, returns a fake image. +- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required; +```python +class DCGAN(object): + def discriminator(self, image): + # input image: either generated images or real ones + # output D_h2: binary logit of the label + + D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0) + D_h0_bn = pd.layer.batchnorm(h0) + D_h0_relu = pd.layer.lrelu(h0_bn) + + D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1) + D_h1_bn = pd.layer.batchnorm(D_h1) + D_h1_relu = pd.layer.lrelu(D_h1_bn) + + D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2) + return D_h2 +``` + +### Class member function: Build the model +- Define data readers as placeholders to hold the data; +- Build generator and discriminators; +- Define two training losses for discriminator and generator, respectively. +If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self): + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_t = self.discriminator(self.images) + # generated fake images + self.sampled = self.sampler(self.z, self.y) + self.D_f = self.discriminator(self.G) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_t = self.discriminator(self.images) + # generate fake images + self.sampled = self.sampler(self.z) + self.D_f = self.discriminator(self.images) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake + + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie)) +``` + +If we do not have dependency engine but blocks, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self, default_block): + # input data in the default block + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + with pd.default_block().g_block(): + 
if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_g = self.discriminator(self.G, self.y) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_g = self.discriminator(self.G, self.y) + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie)) + + with pd.default_block().d_block(): + if self.y_dim: # if conditional GAN, includes label + self.D_t = self.discriminator(self.images, self.y) + self.D_f = self.discriminator(self.G, self.y) + else: # original version of GAN + self.D_t = self.discriminator(self.images) + self.D_f = self.discriminator(self.G) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake +``` +Some small confusion and problems with this design: +- D\_g and D\_f are actually the same thing, but has to be written twice; i.e., if we want to run two sub-graphs conceptually, the same codes have to be written twice if they are shared by the graph. +- Requires ability to create a block anytime, rather than in if-else or rnn only; + +## Main function for the demo: +Generally, the user of GAN just need to the following things: +- Define an object as DCGAN class; +- Build the DCGAN model; +- Specify two optimizers for two different losses with respect to different parameters. +```python +# pd for short, should be more concise. +from paddle.v2 as pd +import numpy as np +import logging + +if __name__ == "__main__": + # dcgan class in the default graph/block + # if we use dependency engine as tensorflow + # the codes, will be slightly different like: + # dcgan = DCGAN() + # dcgan.build_model() + with pd.block() as def_block: + dcgan = DCGAN() + dcgan.build_model(def_block) + + # load mnist data + data_X, data_y = self.load_mnist() + + # Two subgraphs required!!! + with pd.block().d_block(): + d_optim = pd.train.Adam(lr = .001, beta= .1) + d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D) + with pd.block.g_block(): + g_optim = pd.train.Adam(lr = .001, beta= .1) + g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G) + + # executor + sess = pd.executor() + + # training + for epoch in xrange(10000): + for batch_id in range(N / batch_size): + idx = ... + # sample a batch + batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size] + # sample z + batch_z = np.random.uniform(-1., 1., [batch_size, z_dim]) + + if batch_id % 2 == 0: + sess.run(d_step, + feed_dict = {dcgan.images: batch_im, + dcgan.y: batch_label, + dcgan.z: batch_z}) + else: + sess.run(g_step, + feed_dict = {dcgan.z: batch_z}) +``` + +# More thinking about dependency engine v.s. block design: +- What if we just want to run an intermediate result? Do we need to run the whole block/graph? +- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage? diff --git a/doc/design/graph.md b/doc/design/graph.md new file mode 100644 index 0000000000000000000000000000000000000000..7519a65df835a39fe14f6ef45530afff170191ff --- /dev/null +++ b/doc/design/graph.md @@ -0,0 +1,70 @@ +# Design Doc: Computations as a Graph + +A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before. 
+ +This document explains that the construction of a graph as three steps: + +- construct the forward part +- construct the backward part +- construct the optimization part + +## The Construction of a Graph + +Let us take the problem of image classification as a simple example. The application program that trains the model looks like: + +```python +x = layer.data("images") +l = layer.data("label") +y = layer.fc(x) +cost = layer.mse(y, l) +optimize(cost) +train(cost, reader=mnist.train()) +``` + +### Forward Part + +The first four lines of above program build the forward part of the graph. + +![](images/graph_construction_example_forward_only.png) + +In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators. + +Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so to run at most once. By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch. + +In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message. + +### Backward Part + +The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`. + +`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part. + +![](images/graph_construction_example_forward_backward.png) + +According to the chain rule of gradient computation, `ConstructBackwardGraph` would + +1. create a gradient operator G for each operator F, +1. make all inputs, outputs, and outputs' gradient of F as inputs of G, +1. create gradients for all inputs of F, except for those who don't have gradients, like x and l, and +1. make all these gradients as outputs of G. + +### Optimization Part + +For each parameter, like W and b created by `layer.fc`, marked as double circles in above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. Here results in the complete graph: + +![](images/graph_construction_example_all.png) + +## Block and Graph + +The word block and graph are interchangable in the desgin of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block. + +A Block keeps operators in an array `BlockDesc::ops` + +```protobuf +message BlockDesc { + repeated OpDesc ops = 1; + repeated VarDesc vars = 2; +} +``` + +in the order that they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators. diff --git a/doc/design/graph_survey.md b/doc/design/graph_survey.md new file mode 100644 index 0000000000000000000000000000000000000000..6c6db08f463ae0a2b94fc4546f123a1d7c151870 --- /dev/null +++ b/doc/design/graph_survey.md @@ -0,0 +1,232 @@ +## Survey on Graph + +Neural network framework often provides symbolic API for users to write network topology conveniently. 
This doc manily focus on symbolic API in most popular neural network frameworks, and try to find out how to parse symbolic configuration to a portable file, such as protobuf or json. + +### Mxnet + +The core concept of symbolic API is `Symbol`. Mxnet implements `Symbol` class in C++, and export to Python using C-API. Please refer to the comments in Mxnet: + + +`Symbol` is help class used to represent the operator node in Graph. +`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to python front-end (while Graph is not) to enable quick test and deployment. Conceptually, symbol is the final operation of a graph and thus including all the information required (the graph) to evaluate its output value. + + +A simple network topology wrote by Symbol is as follows: + +```python +def get_symbol(num_classes=10, **kwargs): + data = mx.symbol.Variable('data') + data = mx.symbol.Flatten(data=data) + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) + act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) + mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') + return mlp +``` + + + +Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null. + +Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph. + +And Symbol can be saved to a Json file. + +Here is a detailed example: + +``` +>>> import mxnet as mx +>>> data = mx.symbol.Variable('data') +>>> print data.debug_str() +Variable:data + +>>> data = mx.symbol.Flatten(data=data) +>>> print data.debug_str() +Symbol Outputs: + output[0]=flatten0(0) +Variable:data +-------------------- +Op:Flatten, Name=flatten0 +Inputs: + arg[0]=data(0) version=0 + +>>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) +>>> print fc1.debug_str() +Symbol Outputs: + output[0]=fc1(0) +Variable:data +-------------------- +Op:Flatten, Name=flatten0 +Inputs: + arg[0]=data(0) version=0 +Variable:fc1_weight +Variable:fc1_bias +-------------------- +Op:FullyConnected, Name=fc1 +Inputs: + arg[0]=flatten0(0) + arg[1]=fc1_weight(0) version=0 + arg[2]=fc1_bias(0) version=0 +Attrs: + num_hidden=128 + +``` + + +### TensorFlow + + +The core concept of symbolic API is `Tensor`. Tensorflow defines `Tensor` in Python. Please refer to the comments in TensorFlow: + +A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session). + +A simple example is as follows: + +```python + # Build a dataflow graph. + c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) + d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) + e = tf.matmul(c, d) + + # Construct a `Session` to execute the graph. + sess = tf.Session() + + # Execute the graph and store the value that `e` represents in `result`. 
+ result = sess.run(e) +``` + + +The main method of `Tensor` is as follows: + + +```python +@property +def op(self): + """The `Operation` that produces this tensor as an output.""" + return self._op + +@property +def dtype(self): + """The `DType` of elements in this tensor.""" + return self._dtype + +@property +def graph(self): + """The `Graph` that contains this tensor.""" + return self._op.graph + +@property +def name(self): + """The string name of this tensor.""" + if not self._op.name: + raise ValueError("Operation was not named: %s" % self._op) + return "%s:%d" % (self._op.name, self._value_index) + +@property +def device(self): + """The name of the device on which this tensor will be produced, or None.""" + return self._op.device +``` + + +Tensor can be taken as target to run by session. Tensor contains all the information of Graph, and tracks data dependency. + + +Here is a detailed example: + + +``` +>>> import tensorflow as tf +>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) +>>> print c.graph + +>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) +>>> print d.graph + +>>> e = tf.matmul(c, d) +>>> print e.graph + +``` + +### Dynet + + +The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++. + + +A simple example is as follows: + +```cpp +ComputationGraph cg; +Expression W = parameter(cg, pW); + +Expression in = input(cg, xs[i]); +Expression label = input(cg, ys[i]); +Expression pred = W * in; +Expression loss = square(pred - label); +``` + +The input data and parameter are also represented by Expression. Every basci Expression corresponds to a Node. And input data is also a Node. + +Expression has a data member ComputationGraph, and ComputationGraph will be modified in users' configuring process. Expression can be a running target, beacuse Expression contains all dependency. + + +Here is a detailed example: + +write topology in C++ + +``` +ComputationGraph cg; +Expression W = parameter(cg, pW); +cg.print_graphviz(); + +Expression pred = W * xs[i]; +cg.print_graphviz(); + +Expression loss = square(pred - ys[i]); +cg.print_graphviz(); +``` + +compile and print + +``` +# first print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; +} +# second print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; + N1 [label="v1 = v0 * -0.98"]; + N0 -> N1; +} +# third print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; + N1 [label="v1 = v0 * -0.98"]; + N0 -> N1; + N2 [label="v2 = -1.88387 - v1"]; + N1 -> N2; + N3 [label="v3 = -v2"]; + N2 -> N3; + N4 [label="v4 = square(v3)"]; + N3 -> N4; +} +``` + +### Conclusion + + +Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are the same level concepts. We use a unified name Expression here, this level concept has following features: + +- Users wirte topoloy with symbolic API, and all return value is Expression, including input data and parameter. +- Expression corresponds with a global Graph, and Expression can also be composed. 
+- Expression tracks all dependency and can be taken as a run target diff --git a/doc/design/if_else_op.md b/doc/design/if_else_op.md new file mode 100644 index 0000000000000000000000000000000000000000..26d140f06db4ecefa86be015eaa731ffddc6910c --- /dev/null +++ b/doc/design/if_else_op.md @@ -0,0 +1,51 @@ +# The `IfElse` Operator + +PaddlePaddle's `IfElse` operator differs from TensorFlow's: + +- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas +- the PaddlePaddle version takes a vector of boolean value as the condition, and instances corresponding to true values go to the true branch, those corresponding to false values go to the false branch. + +## Example + +The following PaddlePaddle program shows the usage of the IfElse operator: + +```python +import paddle as pd + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch. + +An equivalent C++ program is as follows: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int d = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} +``` diff --git a/doc/design/images/asgd.gif b/doc/design/images/asgd.gif new file mode 100644 index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e Binary files /dev/null and b/doc/design/images/asgd.gif differ diff --git a/doc/design/images/control_flow_graph.png b/doc/design/images/control_flow_graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b Binary files /dev/null and b/doc/design/images/control_flow_graph.png differ diff --git a/doc/design/images/dataflow_equations.png b/doc/design/images/dataflow_equations.png new file mode 100644 index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658 Binary files /dev/null and b/doc/design/images/dataflow_equations.png differ diff --git a/doc/design/images/deep_learning.png b/doc/design/images/deep_learning.png new file mode 100644 index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff Binary files /dev/null and b/doc/design/images/deep_learning.png differ diff --git a/doc/design/images/duplicate_op.graffle b/doc/design/images/duplicate_op.graffle new file mode 100644 index 0000000000000000000000000000000000000000..5979f792e252f028a615729215529c2be42d9165 Binary files /dev/null and b/doc/design/images/duplicate_op.graffle differ diff --git a/doc/design/images/duplicate_op.png b/doc/design/images/duplicate_op.png new file mode 100644 index 0000000000000000000000000000000000000000..f299c5d37f260a1bb0daec886f0a4ee1c1f31c92 Binary files /dev/null and b/doc/design/images/duplicate_op.png differ diff --git a/doc/design/images/duplicate_op2.graffle b/doc/design/images/duplicate_op2.graffle new file mode 100644 index 0000000000000000000000000000000000000000..5cec3bc64dbd44dc99e348485969f29bd128ceb1 
Binary files /dev/null and b/doc/design/images/duplicate_op2.graffle differ diff --git a/doc/design/images/duplicate_op2.png b/doc/design/images/duplicate_op2.png new file mode 100644 index 0000000000000000000000000000000000000000..21cdd5cabf1b5203e1435a75b57770d2f702fa92 Binary files /dev/null and b/doc/design/images/duplicate_op2.png differ diff --git a/doc/design/images/feed_forward.png b/doc/design/images/feed_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b Binary files /dev/null and b/doc/design/images/feed_forward.png differ diff --git a/doc/design/images/feed_forward_regularized.png b/doc/design/images/feed_forward_regularized.png new file mode 100644 index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447 Binary files /dev/null and b/doc/design/images/feed_forward_regularized.png differ diff --git a/doc/design/images/graph_construction_example.bash b/doc/design/images/graph_construction_example.bash new file mode 100755 index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e --- /dev/null +++ b/doc/design/images/graph_construction_example.bash @@ -0,0 +1,11 @@ +cat ./graph_construction_example.dot | \ + sed 's/color=red/color=red, style=invis/g' | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_only.png + +cat ./graph_construction_example.dot | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_backward.png + +cat ./graph_construction_example.dot | \ + dot -Tpng > graph_construction_example_all.png diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot new file mode 100644 index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9 --- /dev/null +++ b/doc/design/images/graph_construction_example.dot @@ -0,0 +1,68 @@ +digraph ImageClassificationGraph { + ///////// The forward part ///////// + FeedX [label="Feed", color=blue, shape=box]; + FeedY [label="Feed", color=blue, shape=box]; + InitW [label="Init", color=blue, shape=diamond]; + Initb [label="Init", color=blue, shape=diamond]; + FC [label="FC", color=blue, shape=box]; + MSE [label="MSE", color=blue, shape=box]; + + x [label="x", color=blue, shape=oval]; + l [label="l", color=blue, shape=oval]; + y [label="y", color=blue, shape=oval]; + W [label="W", color=blue, shape=doublecircle]; + b [label="b", color=blue, shape=doublecircle]; + cost [label="cost", color=blue, shape=oval]; + + FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; + FeedY -> l [color=blue]; + InitW -> W [color=blue]; + Initb -> b [color=blue]; + W -> FC [color=blue]; + b -> FC [color=blue]; + l -> MSE [color=blue]; + + ////////// The backward part ///////// + MSE_Grad [label="MSE_grad", color=red, shape=box]; + FC_Grad [label="FC_grad", color=red, shape=box]; + + d_cost [label="d cost", color=red, shape=oval]; + d_y [label="d y", color=red, shape=oval]; + d_b [label="d b", color=red, shape=oval]; + d_W [label="d W", color=red, shape=oval]; + + cost -> MSE_Grad [color=red]; + d_cost -> MSE_Grad [color=red]; + l -> MSE_Grad [color=red]; + y -> MSE_Grad -> d_y [color=red]; + + x -> FC_Grad [color=red]; + y -> FC_Grad [color=red]; + d_y -> FC_Grad [color=red]; + W -> FC_Grad -> d_W [color=red]; + b -> FC_Grad -> d_b [color=red]; + + ////////// The optimizaiton part ////////// + + OPT_W [label="SGD", color=green, shape=box]; + OPT_b 
[label="SGD", color=green, shape=box]; + + W -> OPT_W [color=green]; + b -> OPT_b [color=green]; + d_W -> OPT_W -> W [color=green]; + d_b -> OPT_b -> b [color=green]; + + ////////// Groupings ////////// + + subgraph clusterMSE { + style=invis; + MSE; + MSE_Grad; + } + + subgraph clusterFC { + style=invis; + FC; + FC_Grad; + } +} diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png new file mode 100644 index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2 Binary files /dev/null and b/doc/design/images/graph_construction_example_all.png differ diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png new file mode 100644 index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be Binary files /dev/null and b/doc/design/images/graph_construction_example_forward_backward.png differ diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png new file mode 100644 index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4 Binary files /dev/null and b/doc/design/images/graph_construction_example_forward_only.png differ diff --git a/doc/design/images/l1_regularization.png b/doc/design/images/l1_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972 Binary files /dev/null and b/doc/design/images/l1_regularization.png differ diff --git a/doc/design/images/l2_regularization.png b/doc/design/images/l2_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298 Binary files /dev/null and b/doc/design/images/l2_regularization.png differ diff --git a/doc/design/images/loss_equation.png b/doc/design/images/loss_equation.png new file mode 100644 index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e Binary files /dev/null and b/doc/design/images/loss_equation.png differ diff --git a/doc/design/images/multigpu_allreduce.graffle b/doc/design/images/multigpu_allreduce.graffle new file mode 100644 index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779 Binary files /dev/null and b/doc/design/images/multigpu_allreduce.graffle differ diff --git a/doc/design/images/multigpu_allreduce.png b/doc/design/images/multigpu_allreduce.png new file mode 100644 index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a Binary files /dev/null and b/doc/design/images/multigpu_allreduce.png differ diff --git a/doc/design/images/multigpu_before_convert.graffle b/doc/design/images/multigpu_before_convert.graffle new file mode 100644 index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855 Binary files /dev/null and b/doc/design/images/multigpu_before_convert.graffle differ diff --git a/doc/design/images/multigpu_before_convert.png b/doc/design/images/multigpu_before_convert.png new file mode 100644 index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1 Binary files /dev/null and b/doc/design/images/multigpu_before_convert.png differ diff --git a/doc/design/images/profiler.png b/doc/design/images/profiler.png new file mode 100644 index 
0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1 Binary files /dev/null and b/doc/design/images/profiler.png differ diff --git a/doc/design/images/theta_star.gif b/doc/design/images/theta_star.gif new file mode 100644 index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2 Binary files /dev/null and b/doc/design/images/theta_star.gif differ diff --git a/doc/design/infer_var_type.md b/doc/design/infer_var_type.md new file mode 100644 index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a --- /dev/null +++ b/doc/design/infer_var_type.md @@ -0,0 +1,78 @@ +# Design Doc: InferVarType + +## The Problem Posed + +The variable in our design can hold variant types. Such as `LoDTensor` and `SelectedRows`. An operator should be able to inference the variable types of its output. + +For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output. + +The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator in compile time. + +## Proposed Solution + +The `InferVarType` is a compile-time function which is registered to each operator. The inferface of that function is: + + +```c++ +using InferVarTypeFN = std::function< + void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>; +``` + +It takes an operator description as its input and will write the output variable type and store them in block description. + +The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be + +```cpp +struct OpInfo { + InferVarTypeFN infer_var_type_; + ... +}; +``` + +The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`. + +```cpp +void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) { + // set the output type of variable as `LoDTensor`. + // ... +} + +struct OpInfo { + InferVarTypeFN infer_var_type_; + InferVarTypeFN GetInferVarType() const { + if (infer_var_type_) { + return infer_var_type_; + } else { + return DefaultInferVarType; + } + } +}; +``` + +## Register InferVarType + +We provide a thin base class for registering an `InferVarTypeFN`. To use a base class will ease the implementation of registry since we can detect the registry entry is an `InferVarTypeFN` or not. + +```cpp +class VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0; +} +``` + +Operator developers can write the specialize `VarTypeInferer` as follow. + +```cpp +class SpecialVarTypeInferer : public VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const { + // .. own logic + } +} +``` + +Then user can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`. 
+ +``` +REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...); +``` diff --git a/doc/design/kernel_hint_design.md b/doc/design/kernel_hint_design.md new file mode 100644 index 0000000000000000000000000000000000000000..a54b7da045e1a362626ef066f9ebb56af2c3181a --- /dev/null +++ b/doc/design/kernel_hint_design.md @@ -0,0 +1,57 @@ +## Problem +In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. + +In the current design, we use KernelType to describe one kernel. + +```cpp +struct KernelType { + Place place_; + DataType data_type_; + LayoutType layout_; +}; +``` + `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it. + +The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use. + +So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel. + +The problem is, how should we define and send the information for `GetExpectedKernelType` to use? + +## Solution + +### Potential choice +1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel. + +2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint. + +### Final choice +To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose. + +In C++ + +```cpp +const std::string kForceCPU = "force_cpu"; +const std::string kUseCUDNN = "use_cudnn"; +const std::string kUseMKLDNN = "use_mkldnn"; + +KernelType GetExpectedKernelType() { + if (Attr(kForceCPU)) { + return KernelType(CPUPlace, ...) + } else { + ... + } +} +``` + +In Python code + +```python +FORCE_CPU = core.kForceCPU() + +def xx_layer(..., force_cpu=false): + layer_helper = LayerHelper(...) + layer_helper.append_op( + type="xx", + attr={FORCE_CPU: force_cpu}) +``` diff --git a/doc/design/memory_optimization.md b/doc/design/memory_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..1f68cef4cc28cd005acbeaa5c03cc0d84a83939c --- /dev/null +++ b/doc/design/memory_optimization.md @@ -0,0 +1,217 @@ +# Memory Optimization + + +## Problem + +In a lecture from Andrew Ng, he attributes the recent sucess of AI due to a combination of these: + +- Availability of Big Data +- Supercomputing power to process this Big Data over very large neural networks +- Modern algorithms + +Following graph shows the details: + +![](images/deep_learning.png) + +Larger model usually bring better performance. 
However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary in both online/mobile inference. + +## Solution + +### Basic Strategy + +There are some basic strategies to improve memory usage, including in-place operations and memory sharing. + +#### In-place Operation +In a relu activation operator: + +$y = \max(x, 0)$ + +If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately. + +#### Memory Sharing + +Not all operators support in-place operations. Memory sharing is a more general strategy. + +Following is an example: + +``` +a = op1(b, c); +d = op2(a) +e = op3(d, f) +``` + +In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool. + + +### Live Variable Analysis + +It's not enough to only have some basic strategies. The pre-requisite of memory optimization is to know if a variable is still "live" after an operation. + +In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation. + +In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess tempory variables can be kept in memory. + +Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis. + +We can leran these techniques from compilers. There are mainly two stages to make live variable analysis: + +- construct a control flow graph +- solve the dataflow equations + + +#### Control Flow Graph +To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statemment x can be followed by statement y, there is an egde from x to y. + +Following is the flow graph for a simple loop. + +![](images/control_flow_graph.png) + +#### Dataflow Analysis + +Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program. 
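+
+For liveness in particular, the dataflow equations take the standard form from the compiler literature (the same recursive formula that is shown as an image later in this section), where use(n), def(n), and succ(n) are the use set, def set, and successors of node n, as defined below:
+
+$live\_in(n) = use(n) \cup (live\_out(n) - def(n))$
+
+$live\_out(n) = \bigcup_{s \in succ(n)} live\_in(s)$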
+ +A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes. + +- Flow Graph Terminology + +A flow graph node has out-edges that lead to sucessor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of sucessors. +In former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}. + +- Uses and Defs + +An assignmemt to a variable or temporary defines that variable. An occurence of a variable on the right-hand side of an assginment(or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and the similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}. + +- Liveness + +A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node. + + +The calcution of liveness can be solved by iteration until a fixed pointer is reached. Following is the recursive formula: + +![](images/dataflow_equations.png) + +### Memory optimization transpiler + +At last, we take basic strategy and liveness analysis techniques learning from compilers to implement our memory optimization transpiler. + +#### add in-place attribute + +In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator. + + +#### contruct control flow graph + +Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example. + +- Block0: + +``` +lookup_table +mul +... +while(sub-block idx 1) +... +array_to_lod_tensor +cross_entropy +... +while_grad(sub-block idx 2) +read_from_array +array_to_lod_tensor +... +``` + +- Block1 + +``` +read_from_array +read_from_array +... +write_to_array +increment +write_to_array +less_than +``` + +- Block2 + +``` +read_from_array +increment +... +write_to_array +write_to_array +``` + +We can transfer all the operators and variables in ProgramDesc to build a control flow graph. + +```python +class ControlFlowGraph(object): + def __init__(self, Program): + self._sucessors = defaultdict(set) + self._presucessors = defaultdict(set) + self._uses = defaultdict(set) + self._defs = defaultdict(set) + self._live_in = defaultdict(set) + self._live_out = defaultdict(set) + self._program = Program + + def build(self): + pass + + def dataflow_analysis(self): + pass + + def memory_optimization(self): + pass + + def get_program(self): + return self._program +``` + +#### Make dataflow analysis + +We follow the guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing. 
+ +For example: + +``` +a = op1(b, c); +d = op2(a) +e = op3(d, f) +``` + +The dataflow analysis result is: + +``` +live_in(op1) = {b, c, f} +live_out(op1) = {a, f} + +live_in(op2) = {a, f} +live_out(op2) = {d, f} + +live_in(op3) = {d, f} +live_out(op3) = {} +``` + +After op1, we can process variable b and variable c; After op2, we can process variable a. After op3, we can process variable d and variable f. + +#### memory sharing policy + +A memory pool will be mantained in the stage of memory optimization. Each operator node will be scanned to determine memory optimization is done or not. If an operator satifies the requirement, following policy will be taken to handle input/output variables. + +``` +if op.support_inplace(): + i --> pool + pool --> o +else: + pool --> o + i --> pool +``` + + + +## Reference + +- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5) +- Modern compiler implementation in ML, by Andrew W. Appel +- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html) diff --git a/doc/design/mkl/image/engine.png b/doc/design/mkl/image/engine.png new file mode 100644 index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b Binary files /dev/null and b/doc/design/mkl/image/engine.png differ diff --git a/doc/design/mkl/image/gradients.png b/doc/design/mkl/image/gradients.png new file mode 100644 index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c Binary files /dev/null and b/doc/design/mkl/image/gradients.png differ diff --git a/doc/design/mkl/image/layers.png b/doc/design/mkl/image/layers.png new file mode 100644 index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a Binary files /dev/null and b/doc/design/mkl/image/layers.png differ diff --git a/doc/design/mkl/image/matrix.png b/doc/design/mkl/image/matrix.png new file mode 100644 index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2 Binary files /dev/null and b/doc/design/mkl/image/matrix.png differ diff --git a/doc/design/mkl/image/overview.png b/doc/design/mkl/image/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188 Binary files /dev/null and b/doc/design/mkl/image/overview.png differ diff --git a/doc/design/mkl/mkl_packed.md b/doc/design/mkl/mkl_packed.md new file mode 100644 index 0000000000000000000000000000000000000000..0123315ad4368e68b377f66119949bfd6c1c7860 --- /dev/null +++ b/doc/design/mkl/mkl_packed.md @@ -0,0 +1,108 @@ +# Intel® MKL Packed on PaddlePaddle: Design Doc + + +## Contents + +- [Overview](#overview) +- [Key Points](#key-points) + - [Background](#background) + - [Solution](#solution) +- [Actions](#actions) + - [CMake](#cmake) + - [Layers](#layers) + - [Unit Tests](#unit-tests) + - [Python API](#python-api) + - [Benchmarking](#benchmarking) + + +## Overview +我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中,充分发挥英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 +现阶段的优化主要针对 Recurrent Neural Network(以下简称RNN)相关层(包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`), 以及 PaddlePaddle V1 API。 + +## Key Points + +### Background +目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数,这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。 + +1. 
转换耗时 \ +这一数据格式的转换操作(Packing),在问题本身的计算量比较小的时候,显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中,矩阵大小是`batch_size * 2048`。 +2. 转换冗余 \ +由于在现有的某些情况下(例如RNN),多次调用 cblas_?gemm 会使用相同的原数据,因此,每次调用时对原数据的重复Packing便成为了冗余。 + +为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时,Intel® MKL 引入了以下四个API: + * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc) + * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack) + * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute) + * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free) + +通过使用这些API,我们可以先完成对原数据的Packing操作,再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数,从而避免了Packing冗余。 + +### Solution +在RNN的情况下,同一次前向、后向(forward/backward)过程中所有时间步(time step)共享同一个权重(weight)。当只做推断(inference)时,各次前向之间也都使用了相同的权重,没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。 + +我们通过使用新引入的GEMM Packed APIs,在层初始化的时候,先完成对权重的Packing操作,然后在前向,后向时复用已经转换过的权重,并在每次权重更新后,对新的权重进行转换用于下次迭代。 + +* 优化前,对于序列长度(sequence length)为`T`的网络模型(model), `N`次迭代执行的转换次数为: + - `inference`: `N * T` + - `training`: `2 * N * T` +* 优化后,对于同样设置的网络模型,其转换次数减少至: + - `inference`: `1` + - `training`: `2 * N` + +## Actions + +添加的相关文件和目录结构如下: + +```txt +PaddlePaddle/Paddle +├── ... +└── paddle/ + ├── ... + └── gserver/ + ├── ... + ├── layers/ + │ ├── ... + │ ├── MKLPackedRecurrentLayer.* + | ├── MKLPackedGatedRecurrentLayer.* + | ├── MKLPackedLstmLayer.* + | └── MKLPackedGemm.h + └── tests/ + ├── ... + └── test_MKLPacked.cpp +``` + +### CMake +在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开,来决定是否开启MKL Packed相关功能。 + +### Layers +所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`,该文件对相关GEMM Packed APIs做了封装。 + +### Unit Tests +我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。 +对于每一个新加的RNN layer,我们会对比如下2个方面: +1. 对比优化后layer自身,sequence mode(`rnn_use_batch=false`)与batch mode(`rnn_use_batch=true`)的结果。 +2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。 + +### Python API +计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag,用于选择是否使用相关功能,并且当编译时`WITH_MKL=ON`的情况下,默认设置为`true`。 + +同时,在`python/paddle/trainer/config_parser.py`中对应的layer处,添加`use_mkl_packed`这个选择,方便用户在Python端选择是否启用这个功能。 + +具体实现方式比如: + +```python +use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0))) +if use_mkl_packed: + self.layer_type = mkl_packed_* +``` + +所有相关的`layer_type`会以*mkl_packed_*开头,这些会在`MKLPacked*Layer`注册layer的时候保证,以示区分。 + + +### Benchmarking +会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。 + +## References +1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm) +2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle) + diff --git a/doc/design/mkl/mkldnn.md b/doc/design/mkl/mkldnn.md new file mode 100644 index 0000000000000000000000000000000000000000..e2fe1e6b26ffa73fda81863abfadf697c0acbfcf --- /dev/null +++ b/doc/design/mkl/mkldnn.md @@ -0,0 +1,210 @@ +# Intel® MKL-DNN on PaddlePaddle: Design Doc + +我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn) +(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle, +充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 + +
+Figure 1. PaddlePaddle on IA
+ +近期目标 + +- 完成常用Layer的MKL-DNN实现。 +- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 + +目前的优化,主要针对PaddlePaddle在重构之前的代码框架以及V1的API。 +具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。 + +## Contents + +- [Overview](#overview) +- [Actions](#actions) + - [CMake](#cmake) + - [Matrix](#matrix) + - [Layers](#layers) + - [Activations](#activations) + - [Parameters](#parameters) + - [Gradients](#gradients) + - [Unit Tests](#unit-tests) + - [Python API](#python-api) + - [Benchmarking](#benchmarking) + - [Others](#others) +- [Design Concerns](#design-concerns) + +## Overview + +我们会把MKL-DNN会作为第三方库集成进PaddlePaddle,与其他第三方库一样,会在编译PaddlePaddle的时候下载并编译MKL-DNN。 + +同时,为了进一步提升PaddlePaddle在基本数学运算的计算速度,我们也将MKLML即(MKL small library\[[1](#references)\]) +作为另一个第三方库集成进PaddlePaddle,它只会包括生成好的动态库和头文件。 + +MKL,MKLML以及MKL-DNN三者关系如下表: + +| Name | Open Source | License | Descriptions | +| :---------- | :--------------- | :---------- | :------------ | +| MKL | No | Proprietary | Accelerate math processing routines | +| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning | +| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks | + +MKLML可以与MKL-DNN共同使用,以此达到最好的性能。 + +
+Figure 2. PaddlePaddle with MKL Engines
+ +## Actions + +添加的相关文件和目录结构如下: + +```txt +PaddlePaddle/Paddle +├── ... +├── cmake/ +│ ├── external/ +│ │ ├── ... +│ │ ├── mkldnn.cmake +│ │ └── mklml.cmake +└── paddle/ + ├── ... + ├── math/ + │ ├── ... + │ └── MKLDNNMatrix.* + └── gserver/ + ├── ... + ├── layers/ + │ ├── ... + │ └── MKLDNN*Layer.* + ├── activations/ + │ ├── ... + │ └── MKLDNNActivations.* + └── tests/ + ├── ... + ├── MKLDNNTester.* + └── test_MKLDNN.cpp +``` + +### CMake +在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN + +- `WITH_MKLML` 控制是否使用MKLML库。 +当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 +编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。 +MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。 +- `WITH_MKLDNN` 控制是否使用MKL-DNN。 +当开启`WITH_MKL`时,会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。 +编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。 +MKL-DNN的库目前只有动态库`libmkldnn.so`。 + +### Matrix +目前在PaddlePaddle中数据都是以`NCHW`的格式存储,但是在MKL-DNN中的排列方式不止这一种。 +所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。 + +
+Figure 3. MKLDNNMatrix
+ +### Layers +所有MKL-DNN的Layers都会继承于`MKLDNNLayer`,该类继承于PaddlePaddle的基类`Layer`。 +在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑, +子类只需要使用定义好的接口,实现具体的函数功能即可。 + +
+Figure 4. MKLDNNLayer
+ +每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix: + +- 内部存储(internel memory):`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表输入数据,输入梯度,输出数据和输出梯度。 +- 外部存储(external memory):都是以ext开头,比如`extInVal_`和`extInGrad_`,它们主要是用于, +当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时,转换内存的工作。 +需要注意的是,PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`, +所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存, +如果不需要外部存储用于转换,那么对应的内部存储也会与它们共享内存。 +- 转换函数(resetXXX): 包括`resetInValue`,`resetInGrad`,`resetOutValue`和`resetOutGrad`, +表示对输入数据,输入梯度,输出数据和输出梯度的转换。 +这些函数会根据输入参数重新设置内部和外部存储,当然这两者也可以相等,即表示不需要转换。 + +注意:每个`MKLDNNlayer`的子类只需要使用内部存储就可以了,所有外部的转换工作都会在reset系列函数中都准备好。 + +### Activations +在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是共用一块内存, +所以添加了对应的`MKLDNNActivation`来实现,方式类似于`MKLDNNLayer`。 + +### Parameters +对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。 +如果存在数据排列格式不一样的情况时,我们会在网络训练之前把格式转换为MKL-DNN希望的格式, +在训练结束的时候再保存为PaddlePaddle的格式,但是整个训练过程中不需要任何转换。 +这样既使得最终保存的参数格式与PaddlePaddle一致,又可以避免不必要的转换。 + +### Gradients +由于MKL-DNN的操作都是直接覆盖的形式,也就是说输出的结果不会在原来的数据上累加, +这样带来的好处就是不需要一直清空memory,节省了不必要的操作。 +但是注意的是,当网络出现分支且在`backward`的时候,需要累加不同Layer传过来的梯度。 +所以在`MKLDNNlayer`中实现了一个merge的方法,此时每个小分支的`Input Gradient` +会先临时保存在`MKLDNNMatrix`中,由分支处的Layer负责求和,并把结果放到当前层的`output_.grad`中。 +所以整体上,在实现每个子类的时候就不需要关心分支的事情了。 + +
+Figure 5. Merge Gradients
+ +### Unit Tests +我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 +测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。 +每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 + +### Python API +目前只考虑**v1 API**。 + +计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。 + +具体实现方式比如: + +```python +use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) +if use_mkldnn + self.layer_type = mkldnn_* +``` + +所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 + +同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 + +### Benchmarking +会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image),用于测试和对比在使用MKL-DNN前后的CNN网络性能。 +测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) + +### Others +1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为4096,具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。 +2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 + +## Design Concerns + +为了更好的符合PaddlePaddle的代码风格\[[3](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。 + +我们总结出一些特别需要注意的点: + +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数, +我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 +2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。 +包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 +4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存, +同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。 +在有普通的CPU layer时, `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。 + +## References +1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。 +主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。 +2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。 +目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。 +3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。 +但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。 +所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/design/mkl/mkldnn_fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..bef126f3f0577b69f646dfe5d10539b372c6a8a5 --- /dev/null +++ b/doc/design/mkl/mkldnn_fluid.md @@ -0,0 +1,149 @@ +# Design Doc: Add MKLDNN Kernel in Fluid Operator + +## Principles + +First of all, we should follow some basical principles like: +1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc. +2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h). +3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). 
Another important point is that we should ensure the data synchronization between different kernel types, which is this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override `GetExpectedKernelType` and `trans` functions to support switching kernels. +4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal conception which can record the `Place`, `Library`, `DataType` and `Layout`. + +## Sulution + +In general, there are four parts we should follow to run a MKL-DNN primitive. +- Create a primitive descriptor that describe this operator +- Create a primitive itself by primitive descriptor and the engine +- Create all memory buffers that primitive needed +- Launch a stream to execute the primitive created +More details can refer to [here](http://01org.github.io/mkl-dnn). + +It's better to avoid reinitialization of primitives and memory handles in the first three stages in every iteration. \ +So we plan to create a map to record all the `primitive` and `memory`, which should not take too much memories as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822). + +It's assumed that following three conditions should be satisfied. +1. there is a unique key for each operator instance. May be the actual name of `Output Tensor`. +2. the `Input Tensor` inside `Compute` function is the one after converted. +3. we can get the phase(eg. `is_test`) inside `Compute` function, otherwise we need to expose this attribue to user. + +### Compute +The algorithm of `Compute` would be described as follow, let's take conv like an example. + +```c++ + + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library."); + + auto& dev_ctx = ctx.template device_context(); + + // find primitive by unique key from mkldnn context + // the op_key should be a unique name of this op instance + auto& p = dev_ctx.findPrimitive(op_key + "_fwd"); + + // assuming the input tensor inside this compute function is the one after converted + // this point should be guarantee by another mechanism + auto& i = dev_ctx.findMemory(op_key + "_input"); + + if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) { + auto fwd_primitive_desc = createPrimitiveDesc(ctx); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + shared_ptr in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data())); + shared_ptr wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data())); + shared_ptr out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data(ctx.GetPlace()))); + shared_ptr fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out)); + + dev_ctx.addMemory(op_key+"_input", in); + dev_ctx.addMemory(op_key+"_output", out); + dev_ctx.addMemory(op_key+"_filer", wgt); + dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive); + dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc); + } + + p = dev_ctx.findPrimitive(op_key + "_fwd"); + + PADDLE_ENFORCE(p, "Should have forward Primitive"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_input"), "Should have input memory"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_output"), "Should have output memory"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_filter"), "Should have filter 
memory"); + PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_unique_key+"_fwd_PD"), "Should have forward PrimitiveDesc"); + dev_ctx.submit(p); + dev_ctx.execute(); // the convert primitive should have already contained. + +``` + +The `createPrimitiveDesc` returns the primitive descripotor of this operator, would be like this: +```c++ + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + algorithm algo = static_cast(ctx.Attr("convolution_algorithm_option")); + prop_kind pk = ctx.Attr("is_test") ? prop_kind::forward_inference : prop_kind::forward_training; + + auto fwd_desc = mkldnn::conv_fwd::desc(/* all the setting above*/); + shared_ptr fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine())); + + return fwd_primitive_desc; + } +``` + +### MKLDNNDeviceContext +`MKLDNNDeviceContext`, which is very straightforward, should contain some base information like: `stream`, `engine` and the map needed. + + +### mkldnn_helper +Some functions would be put in `paddle/platform/mkldnn_helper.h`. +- create MKLDNN memories +- create MKLDNN primitives +- error check function +- etc + + +### Kernel Switch +We should `reorder` the different Layout from other device or to other device. `GetExpectedKernelType` and `trans` functions can help us to implement it. + +`GetExpectedKernelType` should get the context, and this operator can return the best `KernelType`. +`trans` would be like this: + +```c++ +void trans(inputs, ctx) override { + if (NoNeedTrans()) { + return; + } + // find reorder primitive by op_key from context + auto& dev_ctx = ctx.template device_context(); + auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input"); + auto& i = dev_ctx.findMemory(op_key + "_src_input"); + + if (p == nullptr || i == nullptr || changeSized(i, input)) { + auto prim = createPrimitiveDesc(ctx); + auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data); + auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes()); + auto dst = createMemory(p->expected_desc(), newbuffer->data); + auto reorder_primitive(new mkldnn::reorder(src, dst)); + + dev_ctx.addMemory(op_key+"_src_input", src); + dev_ctx.addMemory(op_key+"_input", dst); + dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive); + } + + p = dev_ctx.findPrimitive(op_key + "_reorder_input"); + PADDLE_ENFORCE(p, "Should have Reorder Primitive"); + dev_ctx.submit(p); + if (! this->isMKLDNNKernel()) { + // execute immediately only if this is not mkldnn kernel function. + // otherwise, it can be executed with the operator primitive in Compute + dev_ctx.stream(); + } + // after submit, the input tensor in ExecutionContext should be changed as the converted one + // there should be another mechanism to ensure this +} +``` + +### Unit Test +All the functions should be tested corresponding. +TBD diff --git a/doc/design/model_format.md b/doc/design/model_format.md new file mode 100644 index 0000000000000000000000000000000000000000..e29129fddf775939c9f7a8b49d850d523e6e5a45 --- /dev/null +++ b/doc/design/model_format.md @@ -0,0 +1,36 @@ +# Design Doc: Model Format + +## Motivation + +A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. 
In order to support industrial deployment, the model format must be self-complete and must not expose any training source code. + +As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. + +## Implementation + +The topology is saved as a plain text in a detailed self-contain protobuf file. + +The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task. + +As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, + +The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format. + +|field name | type | description | +| --- | --- | --- | +| version | uint32_t | Version of saved file. Always 0 now. | +| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. | +| tensor desc | void* | TensorDesc protobuf binary message | +| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` | +| lod_level | uint64_t | Level of LoD | +| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. | +| data of lod[0] | uint64_t* | [Optional] lod[0].data() | +| ... | ... | ... | + + + +## Summary + +- We introduce a model format. +- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message. +- A bunch of specified format binary tensors describe the **parameters**. diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md new file mode 100644 index 0000000000000000000000000000000000000000..f86e6b7a564ed23f2bddbec25da1c110014f941d --- /dev/null +++ b/doc/design/operator_kernel_type.md @@ -0,0 +1,91 @@ +# Design Doc: The Keys of Operator Kernel Type +## Problem +An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. 
Currently, `OpKernelType` is defined as follows: + +```cpp +struct OpKernelType { + platform::Place place_; + proto::DataType data_type_; +}; +``` +For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github. + +It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`. + +We often implement a kernel of an operator with some computing library on certain device(place). Please note that computing library and device do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices. + +For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`. + +Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layout of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration. + +## Solution + +There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`. + +```cpp +struct OpKernelType { + platform::Place place_; + platform::Library library_; + proto::DataType data_type_; + framework::Layout layout_; +}; +``` + +The details are as follows: + +### Place + +`Place` is defined as: + +```cpp +typedef boost::variant Place; +``` + +`Place` represents the device memory where data is located. + + +### Library + +One operator kernel is usually implemented based on one library. `Library` is defined as a enum variable: + +```cpp +enum Library { Plain, MKLDNN, CUDNN }; +``` + +We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` library as the `Plain` enumerator. +A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CDUADeviceContext` contains an Eigen library handle and a cuBLAS handle. + +If we want to support new library, a new enumerator need to be added to `Library` and a corresponding new `LibraryDeviceContext` need to be created. + + +### DataType + + +`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported. + +### Layout + +Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout. + +Different layout leads to different implementation of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework. + +- We take layout as a data member of Tensor. Layout is actually a enum variable. 
If Fluid is built with MKLDNN, then the memory format in MKLDNN will also be added into this enum variable. + +- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW. + +- The inference of Layout is at run-time, not at compile-time. + +- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators. + +`Layout` is also defined as a enum variable: + +```cpp +enum Layout { + kNCHW, + kNHWC, +#ifdef PADDLE_WITH_MKLDNN + knChw8c + ... +#endif +}; +``` diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/design/ops/images/2_level_rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505 --- /dev/null +++ b/doc/design/ops/images/2_level_rnn.dot @@ -0,0 +1,56 @@ +digraph G { + + rnn [label="1st level RNN" shape=box] + + subgraph cluster0 { + label = "time step 0" + + sent0 [label="sentence"] + sent1 [label="sentence"] + + rnn1 [label="2nd level RNN" shape=box] + + sent0 -> rnn1 + sent1 -> rnn1 + } + + subgraph cluster1 { + label = "time step 1" + + sent2 [label="sentence"] + sent3 [label="sentence"] + + rnn2 [label="2nd level RNN" shape=box] + + sent2 -> rnn2 + sent3 -> rnn2 + } + + subgraph cluster2 { + label = "time step 2" + + sent4 [label="sentence"] + sent5 [label="sentence"] + + rnn3 [label="2nd level RNN" shape=box] + + sent4 -> rnn3 + sent5 -> rnn3 + } + + + para0 [label="paragraph info 0"] + para1 [label="paragraph info 1"] + para2 [label="paragraph info 2"] + + rnn1 -> para0 + rnn2 -> para1 + rnn3 -> para2 + + para0 -> rnn + para1 -> rnn + para2 -> rnn + + chapter [label="chapter info"] + rnn -> chapter +} diff --git a/doc/design/ops/images/2_level_rnn.png b/doc/design/ops/images/2_level_rnn.png new file mode 100644 index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038 Binary files /dev/null and b/doc/design/ops/images/2_level_rnn.png differ diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455 Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ diff --git a/doc/design/ops/images/rnn.dot b/doc/design/ops/images/rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5 --- /dev/null +++ b/doc/design/ops/images/rnn.dot @@ -0,0 +1,87 @@ +digraph G { + label = "simple RNN implementation" + + ranksep=2; + + //graph [nodesep=1, ranksep=1]; + + node[nodesep=1] + + subgraph cluster0 { + label = "global scope" + rankdir = TB + W + boot_memory + input + output + } + + subgraph cluster1 { + label = "step-scope 0" + rankdir = TB + memory0[label="memory"] + prememory0[label="pre-memory"] + step_input0[label="step input"] + step_output0[label="step output"] + } + + subgraph cluster2 { + label = "step-scope 1" + rankdir = TB + memory1[label="memory"] + prememory1[label="pre-memory"] + step_input1[label="step input"] + step_output1[label="step output"] + } + + 
subgraph cluster3 { + label = "step-scope 2" + rankdir = TB + memory2[label="memory"] + prememory2[label="pre-memory"] + step_input2[label="step input"] + step_output2[label="step output"] + } + + stepnet [shape=box] + stepnet0 [shape=box, style=dashed] + stepnet1 [shape=box, style=dashed] + stepnet2 [shape=box, style=dashed] + + + edge[color=blue] + boot_memory -> prememory0 [label="init" color="blue"] + memory0 -> prememory1 [label="copy/reference" color="blue"] + memory1 -> prememory2 [label="copy/reference" color="blue"] + + edge[color=black] + W -> stepnet0[constraint=false, style=dashed] + W -> stepnet1[constraint=false, style=dashed] + W -> stepnet2[constraint=false, style=dashed] + + memory0 -> stepnet0[style=dashed] + prememory0 -> stepnet0 -> step_output0[style=dashed] + + memory1 -> stepnet1[style=dashed] + prememory1 -> stepnet1 -> step_output1[style=dashed] + + memory2 -> stepnet2[style=dashed] + prememory2 -> stepnet2 -> step_output2[style=dashed] + + input -> step_input0 + input -> step_input1 + input -> step_input2 + + step_input0 -> stepnet0 [style=dashed] + step_input1 -> stepnet1[style=dashed] + step_input2 -> stepnet2[style=dashed] + + step_output0 -> output + step_output1 -> output + step_output2 -> output + + stepnet0 -> stepnet[style=dashed] + stepnet1 -> stepnet[style=dashed] + stepnet2 -> stepnet[style=dashed] + +} diff --git a/doc/design/ops/images/rnn.jpg b/doc/design/ops/images/rnn.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840 Binary files /dev/null and b/doc/design/ops/images/rnn.jpg differ diff --git a/doc/design/ops/images/rnn.png b/doc/design/ops/images/rnn.png new file mode 100644 index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe Binary files /dev/null and b/doc/design/ops/images/rnn.png differ diff --git a/doc/design/ops/images/rnn_2level_data.dot b/doc/design/ops/images/rnn_2level_data.dot new file mode 100644 index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297 --- /dev/null +++ b/doc/design/ops/images/rnn_2level_data.dot @@ -0,0 +1,75 @@ +digraph G { + chapter [label="chapter"] + + subgraph cluster0 { + label = "paragraph 0" + + top_rnn0[label="top rnn step 0" shape=box] + + p0 [label="paragraph 0"] + p1 [label="paragraph 1"] + } + + subgraph cluster1{ + label = "paragraph 1" + + top_rnn1[label="top rnn step 1" shape=box] + + p2 [label="paragraph 0"] + p3 [label="paragraph 1"] + } + + subgraph cluster_p0 { + label = "sentence 0" + + low_rnn0 [label="low rnn step 0" shape=box] + s00 [label="sentence 0"] + s01 [label="sentence 1"] + + low_rnn0 -> s00 + low_rnn0 -> s01 + } + + subgraph cluster_p1 { + label = "sentence 1" + low_rnn1 [label="low rnn step 1" shape=box] + s10 [label="sentence 0"] + s11 [label="sentence 1"] + low_rnn1 -> s10 + low_rnn1 -> s11 + } + + subgraph cluster_p2 { + label = "sentence 1" + low_rnn2 [label="low rnn step 0" shape=box] + s20 [label="sentence 0"] + s21 [label="sentence 1"] + low_rnn2 -> s20 + low_rnn2 -> s21 + } + + subgraph cluster_p3 { + label = "sentence 1" + low_rnn3 [label="low rnn step 1" shape=box] + s30 [label="sentence 0"] + s31 [label="sentence 1"] + low_rnn3 -> s30 + low_rnn3 -> s31 + } + + + chapter -> top_rnn0 + chapter -> top_rnn1 + + top_rnn0 -> p0 + top_rnn0 -> p1 + top_rnn1 -> p2 + top_rnn1 -> p3 + + + p0 -> low_rnn0 + p1 -> low_rnn1 + p2 -> low_rnn2 + p3 -> low_rnn3 + +} diff --git a/doc/design/ops/images/rnn_2level_data.png 
b/doc/design/ops/images/rnn_2level_data.png new file mode 100644 index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6 Binary files /dev/null and b/doc/design/ops/images/rnn_2level_data.png differ diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md new file mode 100644 index 0000000000000000000000000000000000000000..2f4854793fa1f0b02e4dc17b51a48a972be61c06 --- /dev/null +++ b/doc/design/ops/rnn.md @@ -0,0 +1,153 @@ +# RNNOp design + +This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future. + +## RNN Algorithm Implementation + +

+ +

+ +The above diagram shows an RNN unrolled into a full network. + +There are several important concepts here: + +- *step-net*: the sub-graph that runs at each step. +- *memory*, $h_t$, the state of the current step. +- *ex-memory*, $h_{t-1}$, the state of the previous step. +- *initial memory value*, the memory of the first (initial) step. + +### Step-scope + +There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step. + +

+
+Figure 2 illustrates the RNN's data flow +

+ +Please be aware that every step runs the same step-net. Each step does the following: + +1. Creates the step-scope. +2. Initializes the local variables including step-outputs, in the step-scope. +3. Runs the step-net, which uses the above mentioned variables. + +The RNN operator will compose its output from step outputs in each of the step scopes. + +### Memory and Ex-memory + +Let's give more details about memory and ex-memory using a simple example: + +$$ +h_t = U h_{t-1} + W x_t +$$, + +where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively. + +In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step, +or copy the memory value of the previous step to the current ex-memory variable. + +### Usage in Python + +For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + +We can define an RNN's step-net using a Block: + +```python +import paddle as pd + +X = some_op() # x is some operator's output and is a LoDTensor +a = some_op() + +# declare parameters +W = pd.Variable(shape=[20, 30]) +U = pd.Variable(shape=[20, 30]) + +rnn = pd.create_rnn_op(output_num=1) +with rnn.stepnet(): + x = rnn.add_input(X) + # declare a memory (rnn's step) + h = rnn.add_memory(init=a) + # h.pre_state(), the previous memory of rnn + new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) + # update current memory + h.update(new_state) + # indicate that h variables in all step scopes should be merged + rnn.add_outputs(h) + +out = rnn() +``` + +Python API functions in above example: + +- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs. +- `rnn.add_memory`: creates a variable used as the memory. +- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output. + +### Nested RNN and LoDTensor + +An RNN whose step-net includes other RNN operators is known as an *nested RNN*. + +For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level. + +The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text. + +

+ +

+ +```python +import paddle as pd + +W = pd.Variable(shape=[20, 30]) +U = pd.Variable(shape=[20, 30]) + +W0 = pd.Variable(shape=[20, 30]) +U0 = pd.Variable(shape=[20, 30]) + +# a is output of some op +a = some_op() + +# chapter_data is a set of 128-dim word vectors +# the first level of LoD is sentence +# the second level of LoD is a chapter +chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) + +def lower_level_rnn(paragraph): + ''' + x: the input + ''' + rnn = pd.create_rnn_op(output_num=1) + with rnn.stepnet(): + sentence = rnn.add_input(paragraph, level=0) + h = rnn.add_memory(shape=[20, 30]) + h.update( + pd.matmul(W, sentence) + pd.matmul(U, h.pre_state())) + # get the last state as sentence's info + rnn.add_outputs(h) + return rnn + +top_level_rnn = pd.create_rnn_op(output_num=1) +with top_level_rnn.stepnet(): + paragraph_data = rnn.add_input(chapter_data, level=1) + low_rnn = lower_level_rnn(paragraph_data) + paragraph_out = low_rnn() + + h = rnn.add_memory(init=a) + h.update( + pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) + top_level_rnn.add_outputs(h) + +# output the last step +chapter_out = top_level_rnn(output_all_steps=False) +``` + +In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. + +By default, the `RNNOp` will concatenate the outputs from all the time steps. +If the `output_all_steps` is set to False, it will only output the final time step. + + +

+ +

diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md new file mode 100644 index 0000000000000000000000000000000000000000..c4a9bbeeefca0e05c335dd60233691e8bac33015 --- /dev/null +++ b/doc/design/ops/sequence_decoder.md @@ -0,0 +1,229 @@ +# Design: Sequence Decoder Generating LoDTensors +In tasks such as machine translation and visual captioning, +a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time. + +This documentation describes how to implement the sequence decoder as an operator. + +## Beam Search based Decoder +The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set. + +In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users. + +There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users. + +During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . + +For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`; +the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. + +## Changing LoD's absolute offset to relative offsets +The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level. + +The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, +let's call this format the **absolute-offset LoD** for clarity. + +The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows +```python +[[0, 3, 9] + [0, 2, 3, 3, 3, 9]] +``` +The first level tells that there are two sequences: +- the first's offset is `[0, 3)` +- the second's offset is `[3, 9)` + +while on the second level, there are several empty sequences that both begin and end at `3`. +It is impossible to tell how many empty second-level sequences exist in the first-level sequences. + +There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix. + +So let's introduce another format of LoD, +it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. + +For example, to represent the same sequences of the above data + +```python +[[0, 3, 6] + [0, 2, 3, 3, 3, 9]] +``` + +the first level represents that there are two sequences, +their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. 
+ +The second level is the same with the relative offset example because the lower level is a tensor. +It is easy to find out the second sequence in the first-level LoD has two empty sequences. + +The following examples are based on relative-offset LoD. + +## Usage in a simple machine translation model +Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it. + +The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences. + +**Encoder** +```python +import paddle as pd + +dict_size = 8000 +source_dict_size = dict_size +target_dict_size = dict_size +word_vector_dim = 128 +encoder_dim = 128 +decoder_dim = 128 +beam_size = 5 +max_length = 120 + +# encoder +src_word_id = pd.data( + name='source_language_word', + type=pd.data.integer_value_sequence(source_dict_dim)) +src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim) + +src_word_vec = pd.lookup(src_embedding, src_word_id) + +encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim) + +encoder_ctx = pd.last_seq(encoder_out_seq) +# encoder_ctx_proj is the learned semantic vector +encoder_ctx_proj = pd.fc( + encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None) +``` + +**Decoder** + +```python +def generate(): + decoder = pd.while_loop() + with decoder.step(): + decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory + generated_ids = decoder.memory() # TODO init to batch_size s + generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s + + target_word = pd.lookup(trg_embedding, gendrated_ids) + # expand encoder_ctx's batch to fit target_word's lod + # for example + # decoder_mem.lod is + # [[0 1 3], + # [0 1 3 6]] + # its tensor content is [a1 a2 a3 a4 a5] + # which means there are 2 sentences to translate + # - the first sentence has 1 translation prefixes, the offsets are [0, 1) + # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6) + # the target_word.lod is + # [[0, 1, 6] + # [0, 2, 4, 7, 9 12]] + # which means 2 sentences to translate, each has 1 and 5 prefixes + # the first prefix has 2 candidates + # the following has 2, 3, 2, 3 candidates + # the encoder_ctx_expanded's content will be + # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5] + encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) + decoder_input = pd.fc( + act=pd.activation.Linear(), + input=[target_word, encoder_ctx_expanded], + size=3 * decoder_dim) + gru_out, cur_mem = pd.gru_step( + decoder_input, mem=decoder_mem, size=decoder_dim) + scores = pd.fc( + gru_out, + size=trg_dic_size, + bias=None, + act=pd.activation.Softmax()) + # K is an config + topk_scores, topk_ids = pd.top_k(scores, K) + topk_generated_scores = pd.add_scalar(topk_scores, generated_scores) + + selected_ids, selected_generation_scores = decoder.beam_search( + topk_ids, topk_generated_scores) + + # update the states + decoder_mem.update(cur_mem) # tells how to update state + generated_ids.update(selected_ids) + generated_scores.update(selected_generation_scores) + + decoder.output(selected_ids) + decoder.output(selected_generation_scores) + +translation_ids, translation_scores = decoder() +``` +The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates, +returns the 
result of the beam search algorithm. + +In this way, users can customize anything on the input or output of beam search, for example: + +1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. +2. Remove some specific candidate in `selected_ids`. +3. Get the final `translation_ids`, remove the translation sequence in it. + +The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), +so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). + +Both of them are two-level `LoDTensors`: + +- The first level represents `batch_size` of (source) sentences. +- The second level represents the candidate ID sets for translation prefix. + +For example, 3 source sentences to translate, and has 2, 3, 1 candidates. + +Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state. + +For example, the previous state: + +* LoD is `[0, 1, 3][0, 2, 5, 6]` +* content of tensor is `a1 a2 b1 b2 b3 c1` + +the current state is stored in `encoder_ctx_expanded`: + +* LoD is `[0, 2, 7][0 3 5 8 9 11 11]` +* the content is + - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates) + - a2 a2 + - b1 b1 b1 + - b2 + - b3 b3 + - None (c1 has 0 candidates, so c1 is dropped) + +The benefit from the relative offset LoD is that the empty candidate set can be represented naturally. + +The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is: + +```python +decoder.output(selected_ids) +decoder.output(selected_generation_scores) +``` + +The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences. + +Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate. + +Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. + +## LoD and shape changes during decoding +

+ +

+ +According to the image above, the only phase that changes the LoD is beam search. + +## Beam search design +The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs: + +1. `topk_ids`, the top K candidate ids for each prefix. +2. `topk_scores`, the corresponding scores for `topk_ids` +3. `generated_scores`, the score of the prefixes. + +All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix. + +It will return three variables: + +1. `selected_ids`, the final candidate beam search function selected for the next step. +2. `selected_scores`, the scores for the candidates. +3. `generated_scores`, the updated scores for each prefix (with the new candidates appended). + +## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` +The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step, +so it is natural to store them in arrays. + +Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`. + +The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors. +It needs some extensions to support the packing or unpacking an array of `LoDTensors`. diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..691081c268b848811bf5ee6d6a41edfe0f47eec0 --- /dev/null +++ b/doc/design/optimizer.md @@ -0,0 +1,91 @@ +## Optimizer Design + +### The Problem + +A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: + +1. the forward pass, which computes intermediate results and the cost(s), +1. the backward pass, which derives gradients from intermediate results and costs, and +1. the optimization pass, which update model parameters to optimize the cost(s). + +These works rely on three kinds of operators: + +1. forward operators, +1. gradient operators, and +1. optimization operators. + +It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically. + +In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. + + +### High-level Python API to describe the training process + +1. User write code to describe the network: + + ```python + images = layer.data("images") + labels = layer.data("labels") + w1 = pd.var("w1") + b1 = pd.var("b1") + hidden = layer.fc(images, w=w1, b=b1) + cost = layer.mse(hidden, labels) + ``` + + The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + + +2. Users create a certain kind of Optimizer with some argument. + + ```python + optimizer = AdagradOptimizer(learing_rate=0.001) + ``` + +3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list. + + ```python + opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1]) + ``` + The above code snippet will create gradient and optimization operators in Block. 
The return value of `minimize()` is list of optimization operators that will be run by session. + +4. Users use Session/Executor to run this opt_op_list as target to do training. + + ```python + sess.run(target= opt_op_list, ...) + ``` + +#### Optimizer Python interface: + +```python +class Optimizer(object): + """Optimizer Base class. + + """ + + def __init__(self): + pass + + def create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to variables. + + Args: + parameters_and_grads: a list of (variable, gradient) pair to update. + + Returns: + optmization_op_list: a list of optimization operator that will update parameter using gradient. + """ + return None + + def minimize(self, loss, parameter_list): + """Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `append_backward()` and + `create_optimization_pass()` into one. + """ + params_grads = self.create_backward_pass(loss, parameter_list) + update_ops = self.create_optimization_pass(params_grads) + return update_ops + +``` + +Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer. diff --git a/doc/design/paddle_nccl.md b/doc/design/paddle_nccl.md new file mode 100644 index 0000000000000000000000000000000000000000..c7dac70998a6cfec3a6d2fc72b698ff9722e6805 --- /dev/null +++ b/doc/design/paddle_nccl.md @@ -0,0 +1,65 @@ +# Design Doc: NCCL support in Paddle Fluid + +## Abstract + +This Design Doc refers to the NCCL feature in paddle. We propose an approach to support NCCL library both on a single machine and multiple machines. We wrapper the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize Multi-GPU powers in one script. + + +## Motivation + +[NCCL](https://developer.nvidia.com/nccl) is a NVIDIA library support Multi-GPU communicating and optimized for NVIDIA GPUs, it provides routines such as all-gather, all-reduce, broadcast, reduce, reduce-scatter, that can achieve high bandwidth over PCIe and NVLink high-speed interconnect. With NCCL library, we can easily accelerate the training in parallel. + +- Pros +1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library. +1. high performance in NVIDIA GPUs. +1. MPI like primitives, which have low learning cost for users. + +- Cons +1. Only design for NVIDIA GPUs, not a general multi-device solution. +1. Although NCCL1 is opensourced under BSD license, but NCCL2 is not opensourced anymore. + +At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients at any time user interests. + +As a result, during training, we need the operations of peer to peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only need to run the operator with correct place information. + +Besides, it needs interfaces to synchronize model update with each different GPU Cards. + +## Implementation + +As mentioned above, we wrap the NCCL routines as several kinds of operators. Need to note that NCCL need to create Communicator between gpu at the beginning, so there is a NCCLInit operator created. + +### Transpiler + +To be compatible with [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user defined operation graph into sub-graphs to be executed on different devices. + +1. 
The user-defined model will be a single device program + +2. Broadcast/Reduce operators between GPUs will be inserted into the program, even for the multi-node, may insert the `Send`, `Recv` operator. + + *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines* + + + +After compiling, the graph as shows + + + +Operators are added to the sub-graphs. Every GPU assigned a role of `rank0`, `rank1` etc. + +- **Broadcast**. Broadcast operator distribute initialized parameter to all the GPUs from the GPU who owns it. e.g. from`rank0` GPU. +- **AllReduce**. AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce implemented in the Ring-Based communicating method, avoid of the bottle neck in a single GPU. + +Need to notice that AllReduce operator force GPUs synchronized at that point. The whole training process in asynchronous or synchronous mode depends on the AllReduce point in the graph. + +As it shown in the picture, when each GPU compute the gradient of `W`, followed with a `AllReduce` operator, accumulate the `dW` to full batch of data, then run the optimize process individually and apply the gradient to its `W`. + +- **AllReduce** + Need to note that our AllReduce operator is a ring-base AllReduce implementation. If we use the NCCL2 AllReduce primitive, every GPU optimized full batch of data, wasted (n-1) GPU compute resources. In addition, NCCL2 built-in AllReduce will only utilize the communicating resource during synchronization, then update the gradient will be a subsequent phase. In fact, we can amortize the update gradient time cost into the communicating phase. The process is +1. Every parameter has its root card. That card will responsible for aggregating the gradients from GPUs. +2. The whole model's parameter will be hashed to different root card, ensure the load balance between GPUs. +3. Logically neighberhood card will start send parameter to the next one. After one round, the parameter main card will aggregate the full gradients. +4. Then the root card will optimize the parameter. +5. This parameter card will send its optimized result to its neighberhood, then the neighberhood will send parameter to its next one. +6. Finish the sychronization round. + +The total time cost will be 2 * (n-1) * per-parameter-send-time, we reach the goal of amortize the upgrade time into communicating phase. diff --git a/doc/design/parameter_average.md b/doc/design/parameter_average.md new file mode 100644 index 0000000000000000000000000000000000000000..2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f --- /dev/null +++ b/doc/design/parameter_average.md @@ -0,0 +1,72 @@ +# Averaging Parameter in PaddlePaddle + +## Why Averaging +In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can. + +Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset. 
+
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of the parameters obtained by SGD is used as the estimator of the optimal parameter value. The averaging is done as follows:
+
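+A sketch of the rule, written here in LaTeX and assuming the plain Polyak-Juditsky running average over the SGD iterates (PaddlePaddle's implementation approximates this with an average over a window of recent batches, as described below):
+
+```latex
+\bar{\theta}_t = \frac{1}{t} \sum_{i=1}^{t} \theta_i
+```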
+ +We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above. + +### How to perform Parameter Averaging in PaddlePaddle + +Parameter Averaging in PaddlePaddle works in the following way during training : +1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer +2. The optimizer itself is responsible for updating the parameters. +3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself: + 1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches. + 2. However, saving all the N instances of the parameters in memory is not feasible. + 3. Therefore, an approximation algorithm is used. + +Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved. + +During the testing/ saving the model phase, we perform the following steps: +1. Perform the delayed operations. +2. Save current values of the parameters to a temporary variable. +3. Replace the values of the parameters with the averaged values. +4. Perform testing and/or save the parameters. +5. Restore the values of the parameters once done. + +### How to implement Averaging of Parameter in PaddlePaddle + +We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training. + + **Advantages**: + - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op. + - Makes it easy for the users to customize and extend the framework. + + **Disadvantages**: + - Implementation requires re-writing the averaging methodology in Python. + +### Low-Level implementation + +In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input: +- the optimizer +- the window_size to keep the updates + +The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU. + +The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. + +### Python API implementation for ParameterAverageOptimizer + +Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following: +- Any optimizer (RMSProp , AdaGrad etc.) +- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. 
Move the averaged value to a buffer when window is full to avoid loss of precision. + +Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions. +We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc) + +#### Creation of the ParameterAverageOptimizer operator +There are two ways for creating the ParameterAverageOptimizer op: +1. We create the op immediately while building the computation graph. +2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added. + +The proposal is to add the op immediately while building the computation graph. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions. diff --git a/doc/design/parameters_in_cpp.md b/doc/design/parameters_in_cpp.md index b6f99bc7d9d6fafacb0a4bcff806b65d9aef98cc..a7ac3f17c44ca94a669a8f1e283b291bceb42317 100644 --- a/doc/design/parameters_in_cpp.md +++ b/doc/design/parameters_in_cpp.md @@ -1,19 +1,19 @@ # Design Doc: The C++ Class `Parameters` -`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and make Paddle can shared parameter between topologies. We described usages of `Parameter` in [api.md](./api.md). +`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md). -We used Python to implement Parameters when designing V2 API before. There are several defects for current implementation: +We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation: * We just use `memcpy` to share Parameters between topologies, but this is very inefficient. -* We did not implement share Parameters while training. We just trigger `memcpy` when start training. +* We did not support sharing Parameters while training. We just trigger `memcpy` when start training. -It is necessary that we implement Parameters in CPP side. However, it could be a code refactoring for Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current Paddle implementation, there are three concepts associated with `Parameters`: +It is necessary that we implement Parameters in CPP side. However, it could result a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. 
In current PaddlePaddle implementation, there are three concepts associated with `Parameters`: 1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`. It is evident that we should use `paddle::Parameter` when developing `Parameters`. However, the `Parameter` class contains many functions and does not have a clear interface. It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`. When we developing `Parameters`, we only use `create/store Parameter` functionality. -We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation. +We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation. 2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`. We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies. @@ -24,7 +24,7 @@ Also, we should handle multi-GPU/CPU training, because `forward` and `backward` So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD). -The step by step approach for implementation Parameters in Paddle C++ core is listed below. Each step should be a PR and could be merged into Paddle one by one. +The step by step approach for implementation Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one. 1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters. diff --git a/doc/design/profiler.md b/doc/design/profiler.md new file mode 100644 index 0000000000000000000000000000000000000000..b20b5efdc1f1f10ce7cec835adcc6fb374ed4e20 --- /dev/null +++ b/doc/design/profiler.md @@ -0,0 +1,97 @@ +## Introduction + +There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). For most popular deep learning frameworks, they use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU devices. The [`nvprof` tools](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse the CUDA program. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) to profile CPU and Python program by [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually want to collect the time of each operator and locate bottlenecks. The `nvprof` usually collect the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls and events or metrics for CUDA kernels. And the `yep` and `Google's perftools` can't collect the timeline for CUDA program. All these tools can't collect time in the operator level. So we design this profiling tool. + +## Architecture + +The work flow for most task is as follows. 
Each operator runs many times over all the iterations, so the profiler must accumulate the total time of each operator across iterations. Moreover, developers sometimes want to measure more detailed time spans inside an operator, or record time spans elsewhere, so the profiler must support nested time spans. To speed up training, deep learning frameworks support parallel computing, including multiple CPU threads and multiple GPUs, so the profiler must be able to collect a timeline for each thread. In addition, the profiler itself consumes some resources, so developers must be able to enable or disable it easily. Finally, the profiler should present a human-readable report.
+
+```python
+for i in xrange(M):  # M is the iteration number
+    for op in operator_lists:  # `operator_lists` contains all the operators in the network.
+        op.run()
+```
+
+In summary, the profiler should have the following features:
+
+- records time spans inside loops.
+- supports nested time spans.
+- supports multiple threads/multiple GPUs.
+- can be enabled and disabled by users.
+
+But how do we record time for a mixed C++ and CUDA program? There are many C++ APIs to read the current wall-clock time in the host program. For the GPU, however, CUDA kernels may execute concurrently if they are issued on different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and they run asynchronously with respect to the host program unless a synchronization follows the kernel launch. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we also design and apply events to record the timeline, then summarize and present statistics based on these events.
+
+The overall flow is shown in the following figure.
+
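+For reference, the raw CUDA-event timing pattern that this design builds on can be sketched as follows (an illustrative, standalone example, not the profiler's actual implementation):
+
+```c++
+// Illustrative sketch of raw CUDA event timing (not the profiler's actual code).
+#include <cstdio>
+#include <cuda_runtime.h>
+
+__global__ void DummyKernel(float* data, int n) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) data[i] *= 2.0f;
+}
+
+int main() {
+  const int n = 1 << 20;
+  float* d_data = nullptr;
+  cudaMalloc(&d_data, n * sizeof(float));
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  // Record a starting event, launch the kernel, then record an ending event
+  // on the same (default) stream.
+  cudaEventRecord(start);
+  DummyKernel<<<(n + 255) / 256, 256>>>(d_data, n);
+  cudaEventRecord(stop);
+
+  // The kernel launch is asynchronous, so wait for the ending event before
+  // reading the elapsed time (reported in milliseconds).
+  cudaEventSynchronize(stop);
+  float ms = 0.0f;
+  cudaEventElapsedTime(&ms, start, stop);
+  printf("kernel time: %f ms\n", ms);
+
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  cudaFree(d_data);
+  return 0;
+}
+```
+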
+ +### Event + +In above work flow, a pair of events are needed before and after the piece of code to collect time. So the event has a flag to mark whether it is a starting event or an ending event. Except this two kinds of event, sometime, a only marker with a text message is needed, for example, a marker to specify the profiling start or end. There are three kinds of event: + +```c++ +enum EventKind { + kMark, + kPushRange, + kPopRange}; +``` +- kMark: only a marker without time range. +- kPushRange: mark the starting event for time range. +- kPopRange: mark the ending event for time range. + +For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, an event lists are used to record each piece. + +```c++ +class Event { + public: + // The DeviceContext is used to get current CUDA stream. + Event(EventKind kind, std::string name, uint32_t thread_id, + const platform::DeviceContext* dev_ctx = nullptr); + double CpuElapsedUs(const Event& e) const; + double CudaElapsedUs(const Event& e) const; + + private: + EventKind kind_; + std::string name_; + uint32_t thread_id_; + int64_t cpu_ns_; +#ifdef PADDLE_WITH_CUDA + cudaEvent_t event_ = nullptr; + int device_ = -1; +#endif +}; + +struct EventList { + std::forward_list> event_blocks; +}; +``` + +As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler. + +```c++ +enum ProfilerState { + kDisabled, + kCPU, + kCUDA +}; +ProfilerState g_state; +``` +- kDisabled: the disabled state. +- kCPU: CPU profiling state. +- kCUDA: GPU profiling state. + +A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`. + +```c++ +struct RecordEvent { + explicit RecordEvent(const std::string name, + platform::DeviceContext* dev_ctx = nullptr) { + if (kState == ProfilerState::kDisabled) return; + // push the starting event to the event lists. + } + ~RecordEvent() { + if (kState == ProfilerState::kDisabled) return; + // push the ending event to the event lists. + } +}; +``` diff --git a/doc/design/program.md b/doc/design/program.md new file mode 100644 index 0000000000000000000000000000000000000000..bd2456787c4e336d357a65255a8274a7c9e465cc --- /dev/null +++ b/doc/design/program.md @@ -0,0 +1,139 @@ +# Design Doc: PaddlePaddle Programs + +## Compile and Execution + +A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`. + +A simple example PaddlePaddle program can be found in [graph.md](./graph.md): + +```python +x = layer.data("images") +l = layer.data("label") +y = layer.fc(x) +cost = layer.mse(y, l) +optimize(cost) +train(cost, reader=mnist.train()) +``` + +The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message. The last line runs it. + +## Programs and Blocks + +The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. + +- program: some nested blocks +- [block](./block.md): + - some local variable definitions, and + - a sequence of operators + +The concept of block comes from usual programs. 
For example, the following C++ program has three blocks: + +```c++ +int main() { // block 0 + int i = 0; + if (i < 10) { // block 1 + for (int j = 0; j < 10; j++) { // block 2 + } + } + return 0; +} +``` + +The following PaddlePaddle program has three blocks: + +```python +import paddle as pd // block 0 + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): // block 1 + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): // block 2 + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +## `BlockDesc` and `ProgramDesc` + +All protobuf messages are defined in `framework.proto`. + +`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`. + +```protobuf +message BlockDesc { + required int32 parent = 1; + repeated VarDesc vars = 2; + repeated OpDesc ops = 3; +} +``` + +The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks. + +All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array. + +```protobuf +message ProgramDesc { + repeated BlockDesc blocks = 1; +} +``` + + +### Global Block + +The global block is the first one in the above array. + +## Operators that Use Blocks + +In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch. + +The definition of `OpDesc` shows that an operator could have some attributes: + +```protobuf +message OpDesc { + AttrDesc attrs = 1; + ... +} +``` + +and an attribute could be of type block, which is, in fact, a block ID as described above: + +``` +message AttrDesc { + required string name = 1; + + enum AttrType { + INT = 1, + STRING = 2, + ... + BLOCK = ... + } + required AttrType type = 2; + + optional int32 block = 10; // when type == BLOCK + ... +} +``` + +## InferShape + +With this design, the InferShape function should take the following parameters: + +```c++ +void InferShape(int current_block, + int current_operator, + ProgramDesc* program // might change VarDesc values. + ) { + ... +} +``` + +where + +- `current_block` indices into `ProgramDesc::blocks`, +- `current_operator` indices into `BlockDesc::ops`. diff --git a/doc/design/prune.md b/doc/design/prune.md new file mode 100644 index 0000000000000000000000000000000000000000..4a5cf10c79a554779137f0cce5494fdd96ef6b7a --- /dev/null +++ b/doc/design/prune.md @@ -0,0 +1,63 @@ +# Prune + +## Motivation + +We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement +`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` +and generate a pruned `ProgramDesc`. + +## Challenge + +Pruning need to support both variables and operators being evaluation targets. Consider the following +different situations. + +```python +# Case 1: run foward pass. +cost_np = session.run(target=cost) +# Case 2: run backward passing. +opts_np, _ = session.run(target=[cost, opt]) +# Case 3: run checkpointing +_ = session.run(target=checkpoint) +``` + +## Solution + +To support evaluation of operators, we add `is_target` field in the `OpDesc`. 
+ +```c++ +message OpDesc { + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; +``` + +To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599). +For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being +`fetch_op`'s input. Then we also set `fetch_op` is a target. + +### Algorithm + +If an operator needs to be run, it must fall into one of the following cases: + +1. It is the target. +2. It is depended by some other ops, meaning its output is some other op's input. + +The first case can be checked by `op_desc.is_traget()` . The second case can be implement as + +```c++ +bool HasDependentVar(const OpDesc& op_desc, const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} +``` + +Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc). diff --git a/doc/design/python_api.md b/doc/design/python_api.md new file mode 100644 index 0000000000000000000000000000000000000000..73f6d7b90c7dca0d48109cf3d28d5f7cd56b5c0b --- /dev/null +++ b/doc/design/python_api.md @@ -0,0 +1,304 @@ +# Design Doc: Python API + +Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program. + +| Python classes | Protobuf messages | +| --- | --- | +| Program | ProgramDesc | +| Block | BlockDesc | +| Operator | OpDesc | +| Variable | VarDesc | + +Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages. + +## Core Concepts + +### Program + +A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks. + +Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`. + +```python +class Program(objects): + def __init__(self): + self.desc = core.NewProgram() # a C++ ProgramDesc pointer. + self.blocks = vector() + self.blocks.append(Block(self, -1)) # the global block + self.current_block = 0 # initialized to the global block + + def global_block(): + return self.blocks[0] + + def current_block(): + return self.get_block(self.current_block) + + def rollback(): + self.current_block = self.current_block().parent_idx + + def create_block(): + new_block_idx = len(self.block) + self.blocks.append(Block(self, self.current_block)) + self.current_block = new_block_idx + return current_block() +``` + +`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`. 
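+
+A short, hypothetical usage sketch (the variable name `prog` is a placeholder, not part of the API) showing how `create_block` and `rollback` move `current_block` while a nested block is built:
+
+```python
+prog = Program()                  # current_block points to block 0, the global block
+
+# Enter a sub-block, e.g. the step block of an RNN or one branch of an IfElse.
+sub_block = prog.create_block()   # current_block now points to the new block
+# ... operators and variables appended here go into prog.current_block() ...
+
+prog.rollback()                   # current_block points back to the parent block
+```
+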
+ +`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block. + +### Block + +A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes + +1. a map from variable names to an instance of the Python `Variable` class, and +1. a list of `Operator` instances. + +```python +class Block(objects): + def __init__(self, program, parent_idx): + self.desc = core.NewBlock(program.desc) + self.program = program + self.vars = map() + self.ops = vector() + self.parent_idx = parent_idx + + def create_var(self, ...): + return Variable(self, ...) + + def _create_global_var(self, ...): + program.global_block().create_var(...) + + def create_parameter(self, name, ...): + # Parameter is a subclass of variable. See Parameter section for details. + self.vars[name] = Parameter(self._create_global_var(...), ...) + return self.vars[name] + + def append_operator(self, ...): + self.ops.append(Operator(self, ...)) + + def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. + self.ops.prepend(Operator(self, ...)) +``` + +`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. + +`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. + +### Operator + +The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes. + +```python +class Operator(object): + def __init__(self, + block, # Block + type, # string + inputs, # dict + outputs,# dict + attrs # dict + ): + self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs) + core.infer_shape(self.desc, inputs, outputs) + + def type(self): + return self.desc.type() +``` + +`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++. + +### Variable + +Operators take Variables as its inputs and outputs. + +```python +class Variable(object): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + ): + if name is None: + name = unique_name_generator() + self.name = name + self.block = block + self.desc = core.NewVarDesc(block.desc, name, shape, lod_level) + self.writer = None +``` + +Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. + +### Parameter + +A parameter is a global variable with an initializer (or load) operator. 
+ +```python +class Parameter(Variable): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + trainable, # bool + initialize_op_attrs, + optimize_op_attrs): + super(Parameter, self).__init__(block, name, shape, dtype, lod_level) + self.trainable = trainable + self.optimize_op_attrs = optimize_op_attrs + block.prepend(Operator(block, # Block + initialize_op_attrs['type'], # string + None, # no inputs + self, # output is the parameter + initialize_op_attrs) +``` + +When users create a parameter, they can call + +```python +program.create_parameter( + ..., + init_attr={ + type: "uniform_random", + min: -1.0, + max: 1.0, + }) +) +``` + +In above example, `init_attr.type` names an initialize operator. It can also name the load operator + +```python +init_attr={ + type: "load", + filename: "something.numpy", +} +``` + +`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message. + +## Layer Function + +A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. + +Layer functions take `Variable` and configuration parameters as its input and return the output variable(s). + +For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable. + + +### Necessity for reusing code between layer functions + +There are a lot of code that can be reused. Such as + +* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero. +* Append the activation operator. +* Create a temporary variable. +* Create parameter. +* Generate a unique name. +* Add a bias. +* ... + +A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions. + + + +### Comparision between global functions and helper class + +The `FullyConnected` layer will be as follow when we provide global functions: + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + if name is None: + name = unique_name("fc") + input = multiple_input(input) + param_attr = default_param_attr(param_attr) + param_attr = multiple_param_attr(param_attr, len(input)) + + # mul + mul_results = [] + for ipt, attr in zip(input, param_attr): + shape = ipt.shape[1:] + [size] + w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr) + tmp = create_tmp_var(name) + g_program.current_block().append_op("mul", {ipt, w}, {tmp}) + mul_results.append(tmp) + + # add sum + ... + # add bias + ... + # add activation + ... + return out +``` + +We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions: + +1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use. +2. 
Global functions will force layer developers to pass its parameter time by time. + +So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow. + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + helper = LayerHelper(locals()) # pass all parameter to LayerHelper + + mul_results = [] + for ipt, param in helper.iter_multiple_input_and_param(): + w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype) + tmp = helper.create_tmp_variable() + helper.append_op('mul', {ipt, w}, {tmp}) + mul_results.append(tmp) + + pre_bias = helper.add_sum(mul_results) + pre_activation = helper.add_bias(pre_bias) + return helper.add_activation(pre_activation) +``` + +We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor. + + +### Implementation of layer helper + +We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are: + +```python +class LayerHelper(object): + def __init__(self, **kwargs): # kwargs is short for `keyword arguments` + self.kwargs = kwargs + + def add_activation(self, input_var): + act = self.kwargs.get("act", None) # default value is None + if act is None: # do nothing if no act + return input_var + + tmp = self.create_tmp_var(self) + self.append_op(type=act, input=input_var, output=tmp) + return tmp +``` + +### Return value of layer functions + +The layer will return a Variable, which is also the output of an operator. However, outputs of a layer function have more attributes than an operator. There are parameter variables, and their gradient variables need to return. To return them is useful. For example, + +1. Users can debug the network by printing parameter gradients. +2. Users can append attributes to a parameter, such as, `param.stop_gradient=True` will make a parameter stop generate the gradient. We can fix the parameter value during training by using this attribute. + +However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field for layer function since the Python is dynamic typing. + +The sample usage is + +```python +data = fluid.layers.data(...) +hidden = fluid.layers.fc(data, ...) +... + +executor.run(fetch_list=[hidden.param, hidden.param.grad], ...) +``` + + +## Optimizer + +[Optimizer Design Doc](./optimizer.md) diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index f21f7af520df5171798326818ecb97c3bcd14a12..2cd4b6225b61cf374458e40afabad7745f61ba71 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -1,25 +1,25 @@ # Python Data Reader Design Doc -At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that +During the training and testing phases, PaddlePaddle programs need to read data. 
To help the users write code that performs reading input data, we define the following: -- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. -- A *reader creator* is a function that returns a reader function. -- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. -- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. +- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide function which converts reader to batch reader, frequently used reader creators and reader decorators. +and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. ## Data Reader Interface -Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`): +*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: ``` iterable = data_reader() ``` -Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int) +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) -An example implementation for single item data reader creator: +An example implementation for single item data reader creator is as follows: ```python def reader_creator_random_image(width, height): @@ -29,7 +29,7 @@ def reader_creator_random_image(width, height): return reader ``` -An example implementation for multiple item data reader creator: +An example implementation for multiple item data reader creator is as follows: ```python def reader_creator_random_image_and_label(width, height, label): def reader(): @@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label): ## Batch Reader Interface -*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. +*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. 
+ +Here are some valid outputs: -Here are valid outputs: ```python # a mini batch of three data items. Each data item consist three columns of data, each of which is 1. [(1, 1, 1), @@ -52,26 +53,28 @@ Here are valid outputs: # a mini batch of three data items, each data item is a list (single column). [([1,1,1],), ([2,2,2],), -([3,3,3],), +([3,3,3],)] ``` Please note that each item inside the list must be a tuple, below is an invalid output: ```python # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). - # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], - # or three column of datas, each of which is 1. + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. [[1,1,1], [2,2,2], [3,3,3]] ``` -It's easy to convert from reader to batch reader: +It is easy to convert from a reader to a batch reader: + ```python mnist_train = paddle.dataset.mnist.train() mnist_train_batch_reader = paddle.batch(mnist_train, 128) ``` -Also easy to create custom batch reader: +It is also straight forward to create a custom batch reader: + ```python def custom_batch_reader(): while True: @@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader ## Usage -batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: ```python # two data layer is created: @@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ## Data Reader Decorator -*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax. +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. -Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples: +Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: ### Prefetch Data -Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data. +Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. Use `paddle.reader.buffered` to prefetch data: @@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ### Compose Multiple Data Readers -For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). +For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). 
-We can do: +We can do the following : ```python def reader_creator_random_image(width, height): @@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False) reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. -# And we don't care second item at this time. +# And we don't care about the second item at this time. paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) ``` ### Shuffle -Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read. +Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. Example: ```python @@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) ## Q & A -### Why reader return only a single entry, but not a mini batch? +### Why does a reader return only a single entry, and not a mini batch? -Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). +Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2). -We provide function `paddle.batch` to turn (single entry) reader into batch reader. +We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader. -### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient? +### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ? -In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically. +In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. -### Why use a dictionary but not a list to provide mapping? +### Why use a dictionary instead of a list to provide mapping? -We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`). +Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). -### How to create custom data reader creator +### How to create a custom data reader creator ? ```python def image_reader_creator(image_path, label_path, n): @@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) 
### How is `paddle.train` implemented -An example implementation of paddle.train could be: +An example implementation of paddle.train is: ```python def train(batch_reader, mapping, batch_size, total_pass): diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md new file mode 100644 index 0000000000000000000000000000000000000000..f93d6155e1764386b01d2f0df3f141ab75cd55d4 --- /dev/null +++ b/doc/design/refactorization.md @@ -0,0 +1,249 @@ +# Design Doc: Refactorization Overview + +The goals of refactoring include: + +1. Making it easy for external contributors to write new elementary computation operations. +1. Making the codebase clean and readable. +1. Designing a new computation representation -- a computation graph of operators and variables. +1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs. + +## Computation Graphs + +1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs. + + 1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example. + +1. Users write Python programs to describe the graphs and run them (locally or remotely). + +1. A graph is composed of *variables* and *operators*. + +1. The description of graphs must be serializable/deserializable, so that: + + 1. It can be sent to the cloud for distributed execution, and + 1. It can be sent to clients for mobile or enterprise deployment. + +1. The Python program does two things + + 1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to + 1. the C++ library `libpaddle.so` for local execution, + 1. the master process of a distributed training job for training, or + 1. the server process of a Kubernetes serving job for distributed serving. + 1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message. + +## Description and Realization of Computation Graph + +At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph. + +At runtime, the C++ program realizes the graph and runs it. + +| | Representation (protobuf messages) | Realization (C++ class objects) | +|---|---|---| +|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)| +|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)| +|Block|BlockDesc|Block| + +The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). + +## Compilation and Execution + +1. Run a Python program to describe the graph. In particular, the Python application program does the following: + + 1. Create `VarDesc` to represent local/intermediate variables, + 1. Create operators and set attributes, + 1. Validate attribute values, + 1. Infer the type and the shape of variables, + 1. 
Plan memory-reuse for variables, + 1. Generate the backward graph + 1. Add optimization operators to the computation graph. + 1. Optionally, split the graph for distributed training. + +1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following: + + 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block, + 1. realize local variables defined in the BlockDesc message in the new scope, + 1. a scope is similar to the stack frame in programming languages, + + 1. Create an instance of class `Block`, in which, + 1. realize operators in the BlockDesc message, + + 1. Run the Block by calling + 1. `Block::Eval(vector* targets)` for forward and backward computations, or + 1. `Block::Eval(vector* targets)` for optimization. + + +## Intermediate Representation (IR) + +```text +Compile Time -> IR -> Runtime +``` + +### Benefits of IR + +- Optimization + ```text + Compile Time -> IR -> Optimized IR -> Runtime + ``` +- Automatically send partitioned IR to different nodes. + - Automatic Data Parallelism + ```text + Compile Time + |-> Single GPU IR + |-> [trainer-IR-0, trainer-IR-1, pserver-IR] + |-> Node-0 (runs trainer-IR-0) + |-> Node-1 (runs trainer-IR-1) + |-> Node-2 (runs pserver-IR) + ``` + - Automatic Model Parallelism (planned for future) + +--- + +# Operator/OpWithKernel/OpKernel + +![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot) + +--- + +# Operator +![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot) + +* `Operator` is the fundamental building block of the user interface. + * Operator stores input/output variable names and attributes. + * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables. + * Use `Run` to compute the `output` variables from the `input` variables. + +--- + +# OpWithKernel/Kernel + +![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot) + +* `OpWithKernel` inherits `Operator`. +* `OpWithKernel` contains a Kernel map. + * `OpWithKernel::Run` get device's kernel, and invoke `OpKernel::Compute`. + * `OpKernelKey` is the map key. Only device place now, but may be data type later. + +--- + +# Why separate Kernel and Operator + +* Separate GPU and CPU code. + * Make Paddle capable of running without GPU. +* Make one operator (which is a user interface) and create many implementations. + * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel. +--- + +# Libraries for Kernel development + +* `Eigen::Tensor` contains basic math and element-wise functions. + * Note that `Eigen::Tensor` has broadcast implementation. + * Limit the number of `tensor.device(dev) = ` in your code. +* `thrust::transform` and `std::transform`. + * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels. 
+ * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`. +* Hand-writing `GPUKernel` and `CPU` code + * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.) +--- +# Operator Registration + +## Why is registration necessary? +We need a method to build mappings between Op type names and Op classes. + +## How is registration implemented? +Maintaining a map, whose key is the type name and the value is the corresponding Op constructor. + +--- +# The Registry Map + +### `OpInfoMap` + +`op_type(string)` -> `OpInfo` + +`OpInfo`: + +- **`creator`**: The Op constructor. +- **`grad_op_type`**: The type of the gradient Op. +- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes. +- **`checker`**: Used to check attributes. + +--- +# Related Concepts + +### Op_Maker +It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)) + +### Register Macros +```cpp +REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class) +REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) +``` + +--- +# Registration Process +1. Write an Op class and its gradient Op class, if required. +2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator. +3. Invoke the macro `REGISTER_OP`. This macro will + 1. Call maker class to complete `proto` and `checker` + 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap` + +--- +# Backward Module (1/2) +### Create Backward Operator +- Mapping from forward Op to backward Op +![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png) + +--- +# Backward Module (2/2) +### Build Backward Network +- **Input**: a graph of forward operators +- **Output**: a graph of backward operators +- **Corner cases in construction** + - Shared Variables => insert an `Add` operator to combine gradients + - No Gradient => insert a `fill_zero_grad` operator + - Recursive NetOp => call `Backward` recursively + - RNN Op => recursively call `Backward` on stepnet + - RNN Op => recursively call `Backward` on stepnet + + +--- +# Scope, Variable, Tensor + +* `Tensor` is an n-dimension array with type. + * Only dims and data pointers are stored in `Tensor`. + * All operations on `Tensor` are written in `Operator` or global functions. + * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) +* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`. + * `step_scopes` in RNN is a variable and not a tensor. +* `Scope` is where variables are stored. + * map + * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope. + +--- +# Block (in design) +## the difference between original RNNOp and Block +- As an operator is more intuitive than `RNNOp`, +- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`, +- Fits the compile-time/ runtime separation design paradigm. + - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc` + - When graph executes, a Block with `BlockDesc` is passed. 
It then creates `Op` and `Var` instances and then invokes `Run`. + +--- +# Milestone +- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring, +- Model migration + - Framework development gives **priority support** to model migration, for example, + - the MNIST demo needs a Python interface, + - the RNN models require the framework to support `LoDTensor`. + - Determine some timelines, + - Frequently used Ops need to be migrated first, + - Different models can be migrated in parallel. +- Improve the framework at the same time +- Accept imperfection, concentrate on solving the specific problem at the right price. + +--- +# Control the migration quality +- Compare the performance of migrated models with old ones. +- Follow the google C++ style guide. +- Build the automatic workflow of generating Python/C++ documentations. + - The documentation of layers and ops should be written inside the code. + - Take the documentation quality into account when submitting pull requests. + - Preview the documentations, read and improve them from a user's perspective. diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md new file mode 100644 index 0000000000000000000000000000000000000000..8d973eb53178c3e889c845144553a453e11f067c --- /dev/null +++ b/doc/design/register_grad_op.md @@ -0,0 +1,92 @@ +# Design Doc: Gradient Operators Registration + + +## The Problem Posed + +Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance. + +However, we noticed two problems with the current design: + +1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and inserts corresponding `OpDesc` messages into the `ProgramDesc` message. + +1. For some operators, the gradient computation can be written in terms of existing operators. For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator. Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation. + +## The Current Implementation + +Instances of the C++ class `OpInfo` are stored an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows + +```cpp +struct OpInfo { + std::function creator_; + std::string grad_op_type_; + ... +}; + +map OpInfoMap; + +OperatorBase* CreateGradientOperator(const OperatorBase& op) { + return OpInfoMap.at(op.Type()).creator_(...); +} +``` + +## Proposed Solution + +The mapping relationship between an operator and its gradient operators is a function. The interface of this function is: + +```cpp +// (OpDesc) --> vector +std::function(const OpDescBind&)>; +``` + +The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`. + +The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like + +```cpp +struct OpInfo { + std::function>(const OpDescBind&)> grad_op_maker_; + ... 
+}; +``` + +The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators. + +We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is + +```cpp +class GradOpDescMakerBase { +public: + GradOpDescMakerBase(const OpDescBind& ); + virtual std::vector> operator()()const = 0; +}; +``` + +We can convert `GradOpDescMakerBase` to `std::function>(const OpDescBind&)>` by + +```cpp +using GradOpMaker = ...; +std::function(const OpDescBind&)> func; +func = [] (const OpDescBind& fwd_op) { + GradOpMaker maker(fwd_op); + return maker(); +}; +``` + +We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator. + +We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`. + +The user interface should be + +```cpp +vector MinusOpGradMaker(OpDesc) {...} +REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker); +// Developers can still manually implement gradient operator. +REGISTER_OPERATOR(minus_grad, MinusGradOp); +``` + +The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside. + +```cpp +REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp); +``` diff --git a/doc/design/regularization.md b/doc/design/regularization.md new file mode 100644 index 0000000000000000000000000000000000000000..21280ac898feb4dd5e5a5d9e88d121e856850f0b --- /dev/null +++ b/doc/design/regularization.md @@ -0,0 +1,72 @@ +# Regularization in PaddlePaddle + +## Introduction to Regularization +A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore. + +### Parameter Norm Penalties +Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows: + +
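+One standard way to write this (with `theta` denoting the trainable parameters and `X`, `y` the training data) is
+
+$$\tilde{J}(\theta; X, y) = J(\theta; X, y) + \alpha \, \Omega(\theta)$$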
+ +The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`. + +The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows: + +##### L2 Regularization: +
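+A common form of the L2 penalty (also known as weight decay), written over the weights `w`, is
+
+$$\Omega(\theta) = \frac{1}{2} \lVert w \rVert_2^2$$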
+ +##### L1 Regularization +
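+The L1 penalty is usually written as the sum of absolute parameter values,
+
+$$\Omega(\theta) = \lVert w \rVert_1 = \sum_i \lvert w_i \rvert$$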
+ +A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html). + +## Regularization Survey + +A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). + +## Proposal for Regularization in PaddlePaddle + +### Low-Level implementation + +In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations: +- L2_regularization_op +- L1_regularization_op + +These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. + +The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. + +### Computation Graph + +Below is an example of a really simple feed forward neural network. + +
+ +The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows: + +
+    +### Python API implementation for Regularization + +Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. + +#### Creation of Regularization ops +There are two possibilities for creating the regularization ops: +1. We create these ops immediately while building the computation graph. +2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. + +The proposal is to add these ops in a lazy manner just before the backward pass. + +#### Storage of Regularization attributes + +Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers). + + + + + + diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 3692a5248a355cfcfd1cfd0911d43d65166921b1..b9787261092f1f27377886152cb1596d9ff54188 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -1,31 +1,63 @@ -# Paddle发行规范 +# PaddlePaddle发行规范 -Paddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示Paddle版本号。 +PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 -Paddle每次发新的版本,遵循以下流程: +PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -3. 对这个版本的提交,做如下几个操作: - * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 - * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 - * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 - * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步 -4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 -5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 -6. 协同完成Release Note的书写 +1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 +1. 
对这个版本的提交,做如下几个操作: + * 使用Regression Test List作为检查列表,测试本次release的正确性。 + * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 + * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 + * 编译这个版本的python wheel包,并发布到pypi。 + * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 + * pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。 + * 上传方法: + ``` + cd build/python + pip install twine + twine upload dist/[package to upload] + ``` + * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 +1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 +1. 协同完成Release Note的书写 需要注意的是: -* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试Paddle的行为。 +* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 * 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 -# Paddle 分支规范 +## 发布wheel包到pypi -Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 +使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) +完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以 +弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后 +可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法 +使用`twine`工具上传即可。 -* Paddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中: + + +* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux + 发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。 +* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。 + +## 发布Docker镜像 + +上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上 +版本号对应的tag即可: + +1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。 +1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。 +1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]` +1. 
执行 `docker push paddlepaddle/paddle:[version]` + +## PaddlePaddle 分支规范 + +PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 + +* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中: * `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。 * `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。 * `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。 @@ -33,18 +65,18 @@ Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branch * 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。 * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。 - * 当功能分支开发完毕后,向Paddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 + * 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 * BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 -# Paddle回归测试列表 +## PaddlePaddle回归测试列表 -本列表说明Paddle发版之前需要测试的功能点。 +本列表说明PaddlePaddle发版之前需要测试的功能点。 -## Paddle Book中所有章节 +### PaddlePaddle Book中所有章节 -Paddle每次发版本首先要保证Paddle Book中所有章节功能的正确性。功能的正确性包括验证Paddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 +PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 | | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | diff --git a/doc/design/scope.md b/doc/design/scope.md new file mode 100644 index 0000000000000000000000000000000000000000..4da76eebb74abcd26ec2b8671399e6bc4fb58574 --- /dev/null +++ b/doc/design/scope.md @@ -0,0 +1,124 @@ +# Design of Scope in Paddle + +## Overview + +Scope is an important concept in programming languages, which defines a program region that a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. And in another scope, this name may refer to other entity or nothing at all. It clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes: + +- Scope is an association of a name to variable. +- Variables in a parent scope can be retrieved from local scope. + +A detailed explanation of these two attributes goes as following. + + +## Scope is an association of a name to variable. + +Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variable in the scope. + + +1. Scope only contains a map of a name to variable. + + All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc. + +1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear. + +1. Scope only contains methods that are used to Create and Get Variables. Scope do not contain Operators and have no information to run them. 
+ `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables. + - `Create` is used to create a Variable by its name and add the mapping relation. + - `Get` is used to find a Variable by name. + +1. Every variable only belongs to one certain Scope. + + Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`. + +1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. + + Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed. + +```cpp +class Scope { + public: + Variable* Var(const std::string& name); + const Variable* FindVar(const std::string& name) const; + + private: + std::unordered_map> vars_; +}; +``` + + +## Parent scope and local scope + +Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope. + +1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed. +2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent. + +```cpp +class Scope { + public: + Scope(const std::shared_ptr& scope): parent_(scope) {} + + Variable* FindVar(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } else if (parent_ != nullptr) { + return parent_->FindVar(name); + } else { + return nullptr; + } + } + + private: + std::shared_ptr parent_ {nullptr}; +}; +``` + +In `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When user `Get` a variable by its `name`, the `name` will be searched inside the current scope. If the variable cannot be found locally and parent scope is not a `nullptr`, the variable will be searched inside that parent scope. `parent_` pointer's default value is `nullptr`. It means that the scope is a global scope when `parent_` is nullptr. + +A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily. + +# Interface Design + +```cpp +class Variable { + private: + Variable() = default; + friend class Scope; +}; + +class Scope { + private: + Scope(const std::shared_ptr& parent = nullptr); + + public: + static std::shared_ptr Create(const std::shared_ptr& parent = nullptr); + + // return nullptr if not found. + Variable* FindVar(const std::string& name) const; + + // return if already contains same name variable. 
+ Variable* Var(const std::string& name); + + private: + std::shared_ptr parent_; + std::unordered_map> vars_; +}; +``` +## Only scope can create a variable + +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`. + +## When scope destroyed, all variables inside this scope should be destroyed together + +The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together. + +## Sharing a parent scope + +Local scope contains a `parent_` pointer. It is a linked-list for scopes. Using a `shared_ptr` because when a local scope is using, its parents cannot be destroyed. + +Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer. + +## Orthogonal interface + +`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily. diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md new file mode 100644 index 0000000000000000000000000000000000000000..1a98839a957612b91b2276b58818623ecc62d1d5 --- /dev/null +++ b/doc/design/selected_rows.md @@ -0,0 +1,74 @@ +# Design Doc: Selected Rows + +`SelectedRows` is a type of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure: + +```cpp +class SelectedRows { + private: + vector rows_; + Tensor value_; + int height_; +}; +``` + +The field `height_` is the first dimension of `SelectedRows`. The `rows` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`. + +Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be: + +``` +x = SelectedRow { + rows = [73, 84], + value = [[1, 2], [3,4]] +} +``` + + +## SelectedRows in Protobuf + +`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. +So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description. 
+ +```proto +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} + +message LodTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} + +message VarDesc { + required string name = 1; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LodTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; +} +``` + +## InferShape for Selected Rows + +Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor. + +For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like following + +```cpp +void TableLookupGrad::InferShape(context) { + ... + context.SetDataType("Embedding.Grad", kSelectedRows); +} +``` + + +## Sparse Operators + +There are several operators that need to be written to support `SelectedRows`. These are: + +1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`. +2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensor or `SelectedRows`. diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md new file mode 100644 index 0000000000000000000000000000000000000000..c7aeed7f9b4637e1c29d530f37b42d12500af82f --- /dev/null +++ b/doc/design/simple_op_design.md @@ -0,0 +1,202 @@ +## Interaction between C++ and Python + +Users employ API in Python to describe their own network, however, the network construction actually happens in C++. so Protobuf is introduced to send the message between Python and C++. + +The Interaction between Python and C++ can be simplified as two steps: + +1. C++ tells Python how many Ops there are, and what parameter do users need to offer to initialize a new Op. Python then builds API for each Op at compile time. + +2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ for finishing the Op construction task. + +### Message from C++ to Python + +We define a Protobuf message class `OpProto` to hold message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can use to describe a whole Op.” + +Following message are necessary: + +1. Op's name, and its simple comment. +2. Input and output variable number; each variable's name, type, and comment. +3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**. 
+ +So `OpProto` can be defined as follows: + +```proto +enum AttrType { + INT = 1; + FLOAT = 2; + STRING = 3; + INTS = 4; + FLOATS = 5; + STRINGS = 6; +}; + +message AttrValue { + AttrType type = 1; + optional int iv = 2; + optional float fv = 3; + optional string sv = 4; + repeated int ivs = 5; + repeated float fvs = 6; + repeated string svs = 7; +}; + +message AttrProto { + required string name = 1; + required string comment = 2; + required AttrType type = 3; +}; + +message VarProto { + required string name = 1; + required string comment = 2; + required bool is_tensor = 3; +}; + +message OpProto { + repeated VarProto inputs = 1; + repeated VarProto outputs = 2; + repeated AttrProto attrs = 3; + required string type = 4; + required string comment = 5; +}; +``` + +To generate Python code automatically: + +```python +def create_python_ops_creatation_functions(): + op_protos = paddle.framework.OpRegistry.get_all_op_proto() + for type_name in op_protos: + op_proto = op_protos[type_name] + def __impl__(**kwargs): # User must use key word args in Paddle API + inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs] + outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs] + attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs] + opdesc = (input, outputs, type_name, attrs) + return paddle.framework.OpRegistry.CreateOp(opdesc) + __impl__.__doc__ = create_doc_string(op_proto) + globals()[type_name] = __impl__ + +create_python_ops_creatation_functions() +``` + +### Message from Python to C++ + +To hold message needed in the above second step, we define Protobuf message class `OpDesc`. It is used to hold user-specified parameters in Op describing. + +```proto +message OpDesc { + required string type = 1; + repeated string inputs = 2; + repeated string outputs = 3; + map attrs = 4; +}; +``` + +## OpProto Register + +Every Op has its own `OpProto`. For using convenience, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function `OpRegistry::RegisterOp()`. + +```cpp +class OpProtoMaker { +public: + OpProtoMaker(OpProto* proto): proto_(proto) {} +protected: + OpProto* proto_; + void AddInput(const std::string& name, const std::string& desc) {...} + void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...} + void AddComment(const std::string& comment) { ... } +}; + +class OpRegistry { +public: + using OpCreator = std::function; + + template + static void RegisterOp(const std::string& name) { + gCreators_[name] = [](const OpDesc& desc) { + return new OpType(desc); + }; + OpProto& opProto = gProtos_[name]; + OpMaker()(&opProto); + } + + static map gCreators_; + static map gProtos_; +}; + +template +class OpRegister { + public: + OpRegister(std::string type) { + OpRegistry::RegisterOp(type); + } +}; + +#define REGISTER_OP(op_class, op_maker_class, type_name) \ + class op_class##Register { \ + private: \ + const static OpRegister<#op_class, #op_maker_class> reg; \ + }; \ + const Register op_class##Register::reg(#type_name); + +class CosineOp { +// ... 
+} + +struct CosineOpProtoMaker : public OpProtoMaker { + CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { + AddInput("input", "input of cosine op"); + AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0); + AddType("cos"); + AddComment("This is cos op"); + } +} + +REGISTER_OP(CosineOp, CosineOpProtoMaker, cos); +``` + +In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here. + +## Python API + +Python APIs are divided into two types, high-level API and low-level API. + +### High-Level API + +High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs. + +Here is a sample about how a define a fc layer: + +```python +hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid"); +``` + +`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input. + +The definition of `fc_layer()`: + +```python +def fc_layer(input, size, with_bias, activation): + attr_map = {"size":size} + check_attrs(attr_map) + w = make_variable('w') + if with_bias: + b = make_variable('b') + else: + b = None + fc_output = make_variable('fc_output'); + fc_op(input, w, b, fc_output, attr_map) + act_output = make_variable('sigmod_output'); + if activation == "sigmod": + sigmod_op(fc_output, act_output); + elif: + # ... + return act_output; +``` + +### Low Leval API + +In above sample, `fc_op` and `sigmod_op` are low-level API. They build `OpDesc` and invoke corresponding C++ code. + +*TODO* diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md new file mode 100644 index 0000000000000000000000000000000000000000..8983df900460127fc130043c52373dab505363ba --- /dev/null +++ b/doc/design/support_new_device.md @@ -0,0 +1,240 @@ +# Design Doc: Supporting new Device/Library + +## Background + +Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner. + +On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library. + +On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent. + +So, how to support a new Device/Library in Fluid becomes a challenge. + + +## Basic: Integrate A New Device/Library + +For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md). 
+ +There are mainly three parts that we have to consider while integrating a new device/library: + +- Place and DeviceContext: indicate the device id and manage hardware resources + +- Memory and Tensor: malloc/free data on certain device + +- Math Functor and OpKernel: implement computing unit on certain devices/libraries + +### Place and DeviceContext + +Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. + +#### Place +Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`. + +``` + | CPUPlace +Place --| CUDAPlace + | FPGAPlace +``` + +And `Place` is defined as follows: + +``` +typedef boost::variant Place; +``` + +#### DeviceContext + +Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`. + + +``` + /-> CPUDeviceContext +DeviceContext ----> CUDADeviceContext + \-> FPGADeviceContext +``` + +An example of Nvidia GPU is as follows: + +- DeviceContext + + +``` +class DeviceContext { + virtual Place GetPlace() const = 0; +}; +``` + + +- CUDADeviceContext + + +``` +class CUDADeviceContext : public DeviceContext { + Place GetPlace() const override { return place_; } +private: + CUDAPlace place_; + cudaStream_t stream_; + cublasHandle_t cublas_handle_; + std::unique_ptr eigen_device_; // binds with stream_ +}; +``` + +### Memory and Tensor + + +#### memory module + +Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36): + +``` +template +void* Alloc(Place place, size_t size); + +template +void Free(Place place, void* ptr); + +template +size_t Used(Place place); +``` + +To implement these interfaces, we have to implement MemoryAllocator for different Devices. + + +#### Tensor + +[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place. + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ + DDim dim_; +}; +``` + +`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory. 
+ +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data(place); +``` + + + +### Math Functor and OpKernel + +Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors. + +Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example: + +The interface is defined in the header file. + +``` +template +class MaxOutFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); +}; +``` + +CPU implementation is in .cc file + +``` +template +class MaxOutFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + ... + } +}; +``` + +CUDA implementation is in .cu file + +``` +template +class MaxOutFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + ... + } +}; +``` + + +We first obtain the computing handle from a concrete DeviceContext and then compute on tensors. + +The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map. + +Fluid provides different register interfaces in op_registry.h + + +Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example: + +In .cc file: + +``` +REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); +``` + +In .cu file: + +``` +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); +``` + + +## Advanced topics: How to switch between different Device/Library + +Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library. + + +For more details, please refer to following docs: + +- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md) +- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) diff --git a/doc/design/switch_kernel.md b/doc/design/switch_kernel.md new file mode 100644 index 0000000000000000000000000000000000000000..9719e031c70979cd95400701efd30879662e19bc --- /dev/null +++ b/doc/design/switch_kernel.md @@ -0,0 +1,99 @@ +## Background +Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold. + +The `OpKernelType ` is as follows: + +```cpp +struct OpKernelType { + Place place_; + DataType data_type_; + DataLayout data_layout_; + LibraryType library_type_; +}; +``` + +- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace. + +- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. 
Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`. + +- The `data_layout_ ` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel. + +- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`. + +## Problem + +We register a kernel for every operator and every kernel type ideally. However, it is impracticable for the following situations. + +1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel. +2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem. +3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`. + +Take one situation to give a detailed explanation, if we have two Operators: OP1 and OP2, OP1 has one output `op1_to_op2`, and `op1_to_op2` is the input of OP2. + +If OP1 and OP2 run on the same place(for example CPUPlace), then `op1_2_op2` can be used directly by OP2. + +``` +OP1(CPUPlace) + | + op1_2_op2 + | +OP2(CPUPlace) +``` + +If OP1 and OP2 run one different place, then OP2 cannot `use op1_2_op2` directly. + +Problems under these situations are similar. We can formalize this problem as follow. + +We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$. + +## Solution: data transform + +It is clear that transforming inputs of an operator to adapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods. + +We can infer kernel type for each input of an operator. We let this kernel type as `actual kernel type for var`, which means this kernel type is the kernel type that can process this input variable. + +We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`. + +We transform the input data from `actual` to `expect` if the actual kernel type is not as same as expect kernel type. 
+ +The algorithm is described as following + +```cpp +void OperatorWithKernel::Run( + const Scope& scope, + const platform::Place& place) const { + ExecutionContext ctx(...); + auto expected_kernel_key = this->GetExpectedKernelType(ctx); + + Scope& new_scope = scope.NewScope(); + + for (auto& var_name : this->Inputs()) { + auto* tensor_in = GetTensor(var_name); + auto kernel_type_for_var = this->GetKernelTypeForVar(...); + if (kernel_type_for_var.place_ != expected_kernel_key.place_) { + auto* trans_var = new_scope.Var(var_name); + auto* out = DataTransform(expected_kernel_key, + kernel_type_for_var, + *tensor_in); + CopyVariableWithTensor(...); + } + } + + auto kernel = kernels.find(expected_kernel_key); + kernel->Compute(ExecutionContext(...)); +} +``` + +then the actual process for the multi-device above will be: + +``` +OP1(CPUPlace) + | +op1_2_op2(on CPU) + | +[transform](from CPU to GPU) + | +op1_2_op2(on GPU) + | +OP2(CUDAPlace) +``` diff --git a/doc/design/tensor_array.md b/doc/design/tensor_array.md new file mode 100644 index 0000000000000000000000000000000000000000..37e4f7b90f94fa3eb015e733999cd84c96b2239c --- /dev/null +++ b/doc/design/tensor_array.md @@ -0,0 +1,271 @@ +# Design for TensorArray +This design doc presents the necessity of a new C++ class `TensorArray`. +In addition to the very simple C++ implementation + +```c++ +class TensorArray { + public: + explicit TensorArray(const LoDTensor&); + explicit TensorArray(size_t size); + + private: + vector values_; +}; +``` + +We also need to expose it to PaddlePaddle's Python API, +because users would want to use it with our very flexible operators `WhileLoop`. +An example for a RNN based on dynamic operators is + +```python +input = pd.data(...) +num_steps = Var(12) + +TensorArray states(size=num_steps) +TensorArray step_inputs(unstack_from=input) +TensorArray step_outputs(size=num_steps) + +W = Tensor(...) +U = Tensor(...) +default_state = some_op() + +step = Var(1) + +wloop = paddle.create_whileloop(loop_vars=[step]) +with wloop.frame(): + wloop.break_if(pd.equal(step, num_steps) + pre_state = states.read(step-1, default_state) + step_input = step_inputs.read(step) + state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input)) + states.write(step, state) + step_outputs.write(step, state) # output state + step.update(state+1) + +output = step_outputs.stack() +``` + +## Background +Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, call `states[step_id]` will get the state in `step_id`th time step. + +An RNN can be implemented with the following pseudocode + +```c++ +Array states; +Array input_segments; +Array output_segments; +Parameter W, U; + +step = 1 +seq_len = 12 +while_loop { + if (step == seq_len) break; + states[step] = sigmoid(W * states[step-1] + U * input_segments[step]); + output_segments[step] = states[step] // take state as output + step++; +} +``` +According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support. + +Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`. + + +Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short). 
+Segmenting the `LoDTensor` is much more complicated than splitting a tensor, that makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support. + +As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences. + +The implementation is similar to `recurrent_op`. +The key difference is the way **the original input `LoDTensors` and outupts are split to get the `input_segments` and the `output_segments`.** + + +Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly, +the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same. + +## Why `TensorArray` +The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a seperate module. + +The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. + +So there should be an array-like container, which can store the segments of a tensor or LoD tensor. + +**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** . +This is where the notion of `TensorArray` comes from. + +## Introduce TensorArray to uniform all the three RNNs +TensorArray as a new concept is borrowed from TensorFlow, +it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`. + +This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, +such as `recurrent_op`, `RecurrentGradientMachine`. + +In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), +`TensorArray` is used to segment inputs and store states in all time steps. +By providing some methods similar to a C++ array, +the definition of some state-based dynamic models such as RNN can be more natural and highly flexible. + +## Dynamic-operations on TensorArray + +`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented + +```python +# several helper operators for TensorArray +def tensor_array_stack(ta, tensor): + ''' + get a tensor array `ta`, return a packed `tensor`. + ''' + pass + +def tensor_array_unstack(tensor, ta): + ''' + get a `tensor`, unstack it and get a tensor array `ta`. + ''' + pass + +def tensor_array_write(ta, index, tensor, data_shared): + ''' + get a `tensor` and a scalar tensor `index`, write `tensor` into index-th + value of the tensor array `ta`. + `data_shared` is an attribute that specifies whether to copy or reference the tensors. + ''' + pass + +def tensor_array_read(ta, index, tensor): + ''' + get a tensor array `ta`, a scalar tensor `index`, read the index-th value of + `ta` and return as the `tensor`. + ''' + pass + +def tensor_array_size(ta, tensor): + ''' + get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`. + ''' + pass +``` + +It is trivial for users to use so many low-level operators, so some helper methods should be proposed in python wrapper to make `TensorArray` easier to use, +for example + +```python +class TensorArray: + def __init__(self, name): + self.name = name + self.desc = TensorArrayDesc() + + def stack(self, name=None): + ''' + Pack the values in a `TensorArray` into a tensor with rank one higher + than each tensor in `values`. + `stack` can be used to split tensor into time steps for RNN or whileloop. 
+ + @name: str + the name of the variable to output. + ''' + tensor = Var(name) + tensor_array_stack(self.name, tensor) + return tensor + + def unstack(self, input): + ''' + Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. + `unstack` can be used to concatenate all the time steps for RNN or whileloop. + + @input: str + the name of input tensor + ''' + tensor_array_unstack(tensor, self.name) + + def write(self, index, value, data_shared=True): + ''' + Write value into index of the TensorArray. + If `data_shared` is set to True, than the index-th value in TensorArray will + be shared with the tensor passed in. + + @index: str + name of a scalar tensor + @value: str + name of a tensor + @data_shared: bool + ''' + tensor_array_write(self.name, index, value, data_shared) + + def read(self, index, output): + ''' + Read the value at location `index` in the `TensorArray`. + + @index: str + name of a scalar tensor + @output: + name of a output variable + ''' + tensor_array_read(self.name, index, output) + + + def size(self, output): + ''' + Return the number of values. + + @output: str + name of a scalar tensor + ''' + tensor_array_size(self.name, output) +``` + +## LoDTensor-related Supports +The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes varience-length sequences as input, and output sequences too. + +Since each step of RNN can only take a tensor-represented batch of data as input, +some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches. + +Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`, +these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formated as a LoD tensor rather than a tensor. + +Some definitions are like + +```python +def unpack(level): + ''' + Split LodTensor in some `level` and generate batches, if set `sort_by_length`, + will sort by length. + + Returns: + - a new `TensorArray`, whose values are LodTensors and represents batches + of data. + - an int32 Tensor, which stores the map from the new batch's indices to + original LoDTensor + ''' + pass + +def pack(level, indices_map): + ''' + Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` + and `level` and `indices_map`. + ''' + pass +``` + +With these two methods, a varience-length sentence supported RNN can be implemented like + +```c++ +// input is the varient-length data +LodTensor sentence_input(xxx); +TensorArray ta; +Tensor indice_map; +Tensor boot_state = xxx; // to initialize rnn's first state +TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map); +TessorArray step_outputs; +TensorArray states; + +for (int step = 0; step = ta.size(); step++) { + auto state = states.read(step); + // rnnstep is a function which acts like a step of RNN + auto step_input = ta.read(step); + auto step_output = rnnstep(step_input, state); + step_outputs.write(step_output, true/*data_shared*/); +} + +// rnn_output is the final output of an rnn +LoDTensor rnn_output = ta.pack(ta, indice_map); +``` +the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`, +the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend. 
diff --git a/doc/design/test.dot b/doc/design/test.dot new file mode 100644 index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a --- /dev/null +++ b/doc/design/test.dot @@ -0,0 +1,35 @@ + +digraph Test { + z -> generator -> G_img; + G_img -> discriminator -> D_f -> d_loss_f; + label0 -> d_loss_f -> d_loss; + + img -> discriminator -> D_t -> d_loss_t; + label1 -> d_loss_t -> d_loss; + + d_loss -> d_loss_t[color=red, style=dashed]; + d_loss -> d_loss_f[color=red, style=dashed]; + d_loss_t -> D_t[color=red, style=dashed]; + d_loss_f -> D_f[color=red, style=dashed]; + D_t -> discriminator[color=red, style=dashed]; + D_f -> discriminator[color=red, style=dashed]; + + D_f -> g_loss; + label2 -> g_loss; + + g_loss -> D_f[color=green, style=dashed]; + D_f -> discriminator[color=green, style=dashed]; + discriminator -> G_img[color=green, style=dashed]; + G_img -> generator[color=green, style=dashed]; + + discriminator [color=red, shape=box]; + generator [color=green, shape=box]; + z [shape=diamond]; + img [shape=diamond]; + label0 [shape=diamond]; + label1 [shape=diamond]; + label2 [shape=diamond]; + + d_loss [color=red]; + g_loss [color=green]; +} diff --git a/doc/design/test.dot.png b/doc/design/test.dot.png new file mode 100644 index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55 Binary files /dev/null and b/doc/design/test.dot.png differ diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md new file mode 100644 index 0000000000000000000000000000000000000000..89fa95326c5c4909137544c6b5fd574e1281abe2 --- /dev/null +++ b/doc/design/var_desc.md @@ -0,0 +1,69 @@ +## Background +PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations. + +PaddlePaddle use proto message to describe compile time program because + +1. The computation program description must be serializable and saved in a file. +1. During distributed training, the sreialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on different workers. + +The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below. + +| |compile time|runtime| +|---|---|---| +|Data|VarDesc(proto)|Variable(cpp)| +|Operation|OpDesc(proto)|Operator(cpp)| + + +## Definition of VarDesc + +A VarDesc should have a name, and value. The are two kinds of variable type in compile time, they are `LoDTensor` and `SelectedRows`. + +```proto +message VarDesc { + required string name = 1; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LoDTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; +} +``` + +## Definition of TensorDesc + +```proto +enum DataType { + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; +} + +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} +``` + +A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md). 
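+As a quick illustration (using protobuf's text format), the tensor from the comment above -- shape `[UNK, 640, 480]` stored in single precision -- would be described roughly as
+
+```
+data_type: FP32
+dims: [-1, 640, 480]
+```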
+ +## Definition of LodTensorDesc + +```proto +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} +``` + +A LoDTensorDesc contains a tensor and a lod_level. + +## Definition of Variable in Python + +For Variable in Python, please reference [`Python API`](./python_api.md). diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..ed8a0c7e87da133138ecfc7ba6a8217d58b8f71d --- /dev/null +++ b/doc/faq/build_and_install/index_cn.rst @@ -0,0 +1,139 @@ +################### +编译安装与单元测试 +################### + +.. contents:: + +1. 运行Docker GPU镜像出现 "CUDA driver version is insufficient" +---------------------------------------------------------------- + +用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。 +具体的解决方法是: + +.. code-block:: bash + + $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。 + + +2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致 +---------------------------------------------------------------- + +这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是, +用户强制指定特定的Python版本,具体操作如下: + + .. code-block:: bash + + cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR= + +用户需要指定本机上Python的路径:````, ````, ```` + +3. CMake源码编译,Paddle版本号为0.0.0 +-------------------------------------- + +如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现 + +.. code-block:: bash + + CMake Warning at cmake/version.cmake:20 (message): + Cannot add paddle version from git tag + +那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。 + +4. paddlepaddle\*.whl is not a supported wheel on this platform. +------------------------------------------------------------------------ + +出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统,并安装了python 2.7和pip 9.0.1。 + +更新 :code:`pip` 包的方法是\: + +.. code-block:: bash + + pip install --upgrade pip + +如果还不行,可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀, +并对比是否和正在安装的后缀一致。 + +如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ,需要升级pip版本到最新; +如果系统支持 :code:`manylinux1_x86_64` 而安装包(本地)是 :code:`linux_x86_64` ,可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。 + +5. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2 +------------------------------------------------------------------------------------------ +先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载: + +pip uninstall py_paddle paddle + +然后安装paddle的python环境, 在build目录下执行 + +pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl + +6. 遇到“非法指令”或者是“illegal instruction” +-------------------------------------------- + +PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。 + +7. python相关的单元测试都过不了 +-------------------------------- + +如果出现以下python相关的单元测试都过不了的情况: + +.. 
code-block:: bash + + 24 - test_PyDataProvider (Failed) + 26 - test_RecurrentGradientMachine (Failed) + 27 - test_NetworkCompare (Failed) + 28 - test_PyDataProvider2 (Failed) + 32 - test_Prediction (Failed) + 33 - test_Compare (Failed) + 34 - test_Trainer (Failed) + 35 - test_TrainerOnePass (Failed) + 36 - test_CompareTwoNets (Failed) + 37 - test_CompareTwoOpts (Failed) + 38 - test_CompareSparse (Failed) + 39 - test_recurrent_machine_generation (Failed) + 40 - test_PyDataProviderWrapper (Failed) + 41 - test_config_parser (Failed) + 42 - test_swig_api (Failed) + 43 - layers_test (Failed) + +并且查询PaddlePaddle单元测试的日志,提示: + +.. code-block:: bash + + paddle package is already in your PYTHONPATH. But unittest need a clean environment. + Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. + +解决办法是: + +* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 + +8. 下载MKLML库失败 +------------------ + +.. code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2 + make[1]: *** 正在等待未完成的任务.... + +原因:网速或SSL链接原因,导致MKLML库下载不成功。 + +解决办法是:手动下载并安装,具体步骤如下。 + +.. code-block:: bash + + // 1. 进入对应的目录 + cd build/third_party/mklml/src/extern_mklml + + // 2. 查看包的大小, 正常情况下是75M,如果小于75M,即下载失败: + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. 手动下载且解压缩,并手动生成download成功标签: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. 接着编译即可 diff --git a/doc/faq/cluster/index_cn.rst b/doc/faq/cluster/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..e59c1e1a54a0c876d1e6e89f88030de59fb9fc1a --- /dev/null +++ b/doc/faq/cluster/index_cn.rst @@ -0,0 +1,17 @@ +############### +集群训练与预测 +############### + +.. contents:: + +1. 集群多节点训练,日志中保存均为网络通信类错误 +------------------------------------------------ + +集群多节点训练,日志报错为网络通信类错误,比如 :code:`Connection reset by peer` 等。 +此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出,从而引发其他节点无法连接导致,可以参考下面的步骤排查: + +* 从 :code:`train.log` , :code:`server.log` 找到最早报错的地方,查看是否是其他错误引发的报错(比如FPE,内存不足,磁盘空间不足等)。 + +* 如果发现最早的报错就是网络通信的问题,很有可能是非独占方式执行导致的端口冲突,可以联系OP,看当前MPI集群是否支持resource=full参数提交,如果支持增加此参数提交,并更换job 端口。 + +* 如果当前MPI集群并不支持任务独占模式,可以联系OP是否可以更换集群或升级当前集群。 diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index c14160d55ec8fdb9fc552da33f3a3dac13c1a764..9929767cac212237b3e2c3a547ba9a3c9d5f0979 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -1,313 +1,11 @@ -#################### FAQ -#################### +==== -.. contents:: +.. toctree:: + :maxdepth: 1 -1. 如何减少内存占用 ---------------------------------- - -神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。 -PaddlePaddle的内存占用主要分为如下几个方面\: - -* DataProvider缓冲池内存(只针对内存) -* 神经元激活内存(针对内存和显存) -* 参数内存 (针对内存和显存) -* 其他内存杂项 - -其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。 - -减少DataProvider缓冲池内存 -++++++++++++++++++++++++++ - -PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即 - -.. graphviz:: - - digraph { - rankdir=LR; - 数据文件 -> 内存池 -> PaddlePaddle训练 - } - -所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 -个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, -那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为 - -.. 
literalinclude:: src/reduce_min_pool_size.py - -这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。 - -神经元激活内存 -++++++++++++++ - -神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。 -在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系, -一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含 -的时间步信息成正比。 - -所以做法可以有两种: - -* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。 -* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200, - 但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。 - -参数内存 -++++++++ - -PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。 -例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录 -文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。 - -可以考虑使用一些优化算法,例如 :code:`momentum`。 - -2. 如何加速PaddlePaddle的训练速度 ---------------------------------- - -加速PaddlePaddle训练可以考虑从以下几个方面\: - -* 减少数据载入的耗时 -* 加速训练速度 -* 利用分布式训练驾驭更多的计算资源 - -减少数据载入的耗时 -++++++++++++++++++ - -使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。 -:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。 - -.. literalinclude:: src/reduce_min_pool_size.py - -同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。 - - -加速训练速度 -++++++++++++ - -PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True` - -这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\: - -使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\: - -.. literalinclude:: src/word2vec_dataprovider.py - -这个任务的配置为\: - -.. literalinclude:: src/word2vec_config.py - - -利用更多的计算资源 -++++++++++++++++++ - -利用更多的计算资源可以分为一下几个方式来进行\: - -* 单机CPU训练 - - * 使用多线程训练。设置命令行参数 :code:`trainer_count`。 - -* 单机GPU训练 - - * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 - * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。 - -* 多机训练 - - * 请参考 :ref:`cluster_train` 。 - - -3. 遇到“非法指令”或者是“illegal instruction” --------------------------------------------- - -PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。 - -4. 如何选择SGD算法的学习率 --------------------------- - -在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。 - -通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。 - -如果训练过程的的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。 - - -5. 如何初始化参数 ------------------ - -默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\: - -* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` -* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` - -比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。 - -.. code-block:: python - - hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), - bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) - -上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。 - -6. 
如何共享参数 ---------------- - -PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。 - -简单的全连接网络,参数共享的配置示例为\: - -.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py - -这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 - -7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform. ------------------------------------------------------------------------- - -出现这个问题的主要原因是,系统编译wheel包的时候,使用的 :code:`wheel` 包是最新的, -而系统中的 :code:`pip` 包比较老。具体的解决方法是,更新 :code:`pip` 包并重新编译PaddlePaddle。 -更新 :code:`pip` 包的方法是\: - -.. code-block:: bash - - pip install --upgrade pip - -8. python相关的单元测试都过不了 --------------------------------- - -如果出现以下python相关的单元测试都过不了的情况: - -.. code-block:: bash - - 24 - test_PyDataProvider (Failed) - 26 - test_RecurrentGradientMachine (Failed) - 27 - test_NetworkCompare (Failed) - 28 - test_PyDataProvider2 (Failed) - 32 - test_Prediction (Failed) - 33 - test_Compare (Failed) - 34 - test_Trainer (Failed) - 35 - test_TrainerOnePass (Failed) - 36 - test_CompareTwoNets (Failed) - 37 - test_CompareTwoOpts (Failed) - 38 - test_CompareSparse (Failed) - 39 - test_recurrent_machine_generation (Failed) - 40 - test_PyDataProviderWrapper (Failed) - 41 - test_config_parser (Failed) - 42 - test_swig_api (Failed) - 43 - layers_test (Failed) - -并且查询PaddlePaddle单元测试的日志,提示: - -.. code-block:: bash - - paddle package is already in your PYTHONPATH. But unittest need a clean environment. - Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. - -解决办法是: - -* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 - - -9. 运行Docker GPU镜像出现 "CUDA driver version is insufficient" ----------------------------------------------------------------- - -用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。 -具体的解决方法是: - -.. code-block:: bash - - $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu - -更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。 - - -10. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致 ----------------------------------------------------------------- - -这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是, -用户强制指定特定的Python版本,具体操作如下: - - .. code-block:: bash - - cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR= - -用户需要指定本机上Python的路径:````, ````, ```` - -11. CMake源码编译,Paddle版本号为0.0.0 --------------------------------------- - -如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现 - -.. code-block:: bash - - CMake Warning at cmake/version.cmake:20 (message): - Cannot add paddle version from git tag - -那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。 - -12. A protocol message was rejected because it was too big ----------------------------------------------------------- - -如果在训练NLP相关模型时,出现以下错误: - -.. 
code-block:: bash - - [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h. - F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) - -可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似: - -.. code-block:: python - - src_dict = dict() - for line_count, line in enumerate(open(src_dict_path, "r")): - src_dict[line.strip()] = line_count - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={"src_dict": src_dict}) - -解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为: - -.. code-block:: python - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={"src_dict_path": src_dict_path}) - -完整源码可参考 `seqToseq `_ 示例。 - -13. 如何指定GPU设备 -------------------- - -例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU: - -* 方式1:通过 `CUDA_VISIBLE_DEVICES `_ 环境变量来指定特定的GPU。 - -.. code-block:: bash - - env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2 - -* 方式2:通过命令行参数 ``--gpu_id`` 指定。 - -.. code-block:: bash - - paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 - - -14. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办? ------------------------------------------------------------------------- - -Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。 -主要原因包括两个方面: - -* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。 -* 模型一直不收敛,发散到了一个数值特别大的地方。 -* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 - -主要的解决办法是减小学习律或者对数据进行归一化处理。 + build_and_install/index_cn.rst + model/index_cn.rst + parameter/index_cn.rst + local/index_cn.rst + cluster/index_cn.rst diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..0306b1e5dd25a55545e464ce847291c33576575f --- /dev/null +++ b/doc/faq/local/index_cn.rst @@ -0,0 +1,259 @@ +############### +本地训练与预测 +############### + +.. contents:: + +1. 如何减少内存占用 +------------------- + +神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。 +PaddlePaddle的内存占用主要分为如下几个方面\: + +* DataProvider缓冲池内存(只针对内存) +* 神经元激活内存(针对内存和显存) +* 参数内存 (针对内存和显存) +* 其他内存杂项 + +其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。 + +减少DataProvider缓冲池内存 +++++++++++++++++++++++++++ + +PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即 + +.. graphviz:: + + digraph { + rankdir=LR; + 数据文件 -> 内存池 -> PaddlePaddle训练 + } + +所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 +个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, +那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为 + +.. 
literalinclude:: src/reduce_min_pool_size.py + +这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。 + +神经元激活内存 +++++++++++++++ + +神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。 +在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系, +一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含 +的时间步信息成正比。 + +所以做法可以有两种: + +* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。 +* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200, + 但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。 + +参数内存 +++++++++ + +PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。 +例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录 +文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。 + +可以考虑使用一些优化算法,例如 :code:`momentum`。 + +2. 如何加速训练速度 +------------------- + +加速PaddlePaddle训练可以考虑从以下几个方面\: + +* 减少数据载入的耗时 +* 加速训练速度 +* 利用分布式训练驾驭更多的计算资源 + +减少数据载入的耗时 +++++++++++++++++++ + +使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。 +:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。 + +.. literalinclude:: src/reduce_min_pool_size.py + +同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。 + + +加速训练速度 +++++++++++++ + +PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True` + +这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\: + +使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\: + +.. literalinclude:: src/word2vec_dataprovider.py + +这个任务的配置为\: + +.. literalinclude:: src/word2vec_config.py + + +利用更多的计算资源 +++++++++++++++++++ + +利用更多的计算资源可以分为以下几个方式来进行\: + +* 单机CPU训练 + + * 使用多线程训练。设置命令行参数 :code:`trainer_count`。 + +* 单机GPU训练 + + * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 + * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。 + +* 多机训练 + + * 请参考 :ref:`cluster_train` 。 + +3. 如何指定GPU设备 +------------------ + +例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU: + +* 方式1:通过 `CUDA_VISIBLE_DEVICES `_ 环境变量来指定特定的GPU。 + +.. code-block:: bash + + env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2 + +* 方式2:通过命令行参数 ``--gpu_id`` 指定。 + +.. code-block:: bash + + paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 + + +4. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办? +------------------------------------------------------------------------ + +Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。 +主要原因包括两个方面: + +* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。 +* 模型一直不收敛,发散到了一个数值特别大的地方。 +* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 + +这里有两种有效的解决方法: + +1. 设置 :code:`gradient_clipping_threshold` 参数,示例代码如下: + +.. code-block:: python + +optimizer = paddle.optimizer.RMSProp( + learning_rate=1e-3, + gradient_clipping_threshold=10.0, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + +具体可以参考 `nmt_without_attention `_ 示例。 + +2. 设置 :code:`error_clipping_threshold` 参数,示例代码如下: + +.. code-block:: python + +decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size * 3, + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) + +完整代码可以参考示例 `machine translation `_ 。 + +两种方法的区别: + +1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用; +2. 
截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度; + +除此之外,还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。 + +5. 如何调用 infer 接口输出多个layer的预测结果 +----------------------------------------------- + +* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入,代码如下: + +.. code-block:: python + + inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters) + +* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例,代码如下: + +.. code-block:: python + + out = inferer.infer(input=data_batch, field=["value"]) + +需要注意的是: + +* 如果指定了2个layer作为输出层,实际上需要的输出结果是两个矩阵; +* 假设第一个layer的输出A是一个 N1 * M1 的矩阵,第二个 Layer 的输出B是一个 N2 * M2 的矩阵; +* paddle.v2 默认会将A和B 横向拼接,当N1 和 N2 大小不一样时,会报如下的错误: + +.. code-block:: python + + ValueError: all the input array dimensions except for the concatenation axis must match exactly + +多个层的输出矩阵的高度不一致导致拼接失败,这种情况常常发生在: + +* 同时输出序列层和非序列层; +* 多个输出层处理多个不同长度的序列; + +此时可以在调用infer接口时通过设置 :code:`flatten_result=False` , 跳过“拼接”步骤,来解决上面的问题。这时,infer接口的返回值是一个python list: + +* list 中元素的个数等于网络中输出层的个数; +* list 中每个元素是一个layer的输出结果矩阵,类型是numpy的ndarray; +* 每一个layer输出矩阵的高度,在非序列输入时:等于样本数;序列输入时等于:输入序列中元素的总数;宽度等于配置中layer的size; + +6. 如何在训练过程中获得某一个layer的output +----------------------------------------------- + +可以在event_handler中,通过 :code:`event.gm.getLayerOutputs("layer_name")` 获得在模型配置中某一层的name :code:`layer_name` 在当前 +mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarray` ,可以通过这个输出来完成自定义的评估指标计算等功能。例如下面代码: + +.. code-block:: python + + def score_diff(right_score, left_score): + return np.average(np.abs(right_score - left_score)) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 25 == 0: + diff = score_diff( + event.gm.getLayerOutputs("right_score")["right_score"][ + "value"], + event.gm.getLayerOutputs("left_score")["left_score"][ + "value"]) + logger.info(("Pass %d Batch %d : Cost %.6f, " + "average absolute diff scores: %.6f") % + (event.pass_id, event.batch_id, event.cost, diff)) + +注意:此方法不能获取 :code:`paddle.layer.recurrent_group` 里step的内容,但可以获取 :code:`paddle.layer.recurrent_group` 的输出。 + +7. 如何在训练过程中获得参数的权重和梯度 +----------------------------------------------- + +在某些情况下,获得当前mini-batch的权重(或称作weights, parameters)有助于在训练时观察具体数值,方便排查以及快速定位问题。 +可以通过在 :code:`event_handler` 中打印其值(注意,需要使用 :code:`paddle.event.EndForwardBackward` 保证使用GPU训练时也可以获得), +示例代码如下: + +.. code-block:: python + + ... + parameters = paddle.parameters.create(cost) + ... + def event_handler(event): + if isinstance(event, paddle.event.EndForwardBackward): + if event.batch_id % 25 == 0: + for p in parameters.keys(): + logger.info("Param %s, Grad %s", + parameters.get(p), parameters.get_grad(p)) + +注意:“在训练过程中获得某一个layer的output”和“在训练过程中获得参数的权重和梯度”都会造成训练中的数据从C++拷贝到numpy,会对训练性能造成影响。不要在注重性能的训练场景下使用。 \ No newline at end of file diff --git a/doc/faq/local/src/reduce_min_pool_size.py b/doc/faq/local/src/reduce_min_pool_size.py new file mode 100644 index 0000000000000000000000000000000000000000..9efdb5707ac4a0cb701ec4fef6dfff399d6150cb --- /dev/null +++ b/doc/faq/local/src/reduce_min_pool_size.py @@ -0,0 +1,21 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +@provider(min_pool_size=0, ...) +def process(settings, filename): + os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before. + with open('%s.shuf' % filename, 'r') as f: + for line in f: + yield get_sample_from_line(line) diff --git a/doc/faq/local/src/word2vec_config.py b/doc/faq/local/src/word2vec_config.py new file mode 100644 index 0000000000000000000000000000000000000000..b4fcf0960eda37f98f9f7f2949148f8d51cd2008 --- /dev/null +++ b/doc/faq/local/src/word2vec_config.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +... # the settings and define data provider is omitted. +DICT_DIM = 3000 # dictionary dimension. +word_ids = data_layer('word_ids', size=DICT_DIM) + +emb = embedding_layer( + input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True)) +emb_sum = pooling_layer(input=emb, pooling_type=SumPooling()) +predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax()) +outputs( + classification_cost( + input=predict, label=data_layer( + 'label', size=DICT_DIM))) diff --git a/doc/faq/local/src/word2vec_dataprovider.py b/doc/faq/local/src/word2vec_dataprovider.py new file mode 100644 index 0000000000000000000000000000000000000000..3b6273b0574fb54a7c374a1f19fdd993fff5c730 --- /dev/null +++ b/doc/faq/local/src/word2vec_dataprovider.py @@ -0,0 +1,24 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DICT_DIM = 3000 + + +@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)]) +def process(settings, filename): + with open(filename) as f: + # yield word ids to predict inner word id + # such as [28, 29, 10, 4], 4 + # It means the sentance is 28, 29, 4, 10, 4. + yield read_next_from_file(f) diff --git a/doc/faq/model/index_cn.rst b/doc/faq/model/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..6947948bc79f4dba63954c459afb940e3242c405 --- /dev/null +++ b/doc/faq/model/index_cn.rst @@ -0,0 +1,80 @@ +######### +模型配置 +######### + +.. contents:: + +1. 出现 :code:`Duplicated layer name` 错误怎么办 +-------------------------------------------------- + +出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时,先找出参数 :code:`name` 取值相同的layer,然后将这些layer的参数 :code:`name` 设置为不同的值。 + +2. 
:code:`paddle.layer.memory` 的参数 :code:`name` 如何使用 +------------------------------------------------------------- + +* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出,该layer是通过参数 :code:`name` 指定,即,:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer,并将该layer上一时间步的输出作为自身当前时间步的输出。 + +* PaddlePaddle的所有layer都有唯一的name,用户通过参数 :code:`name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer,其name由参数 :code:`memory_name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer,需要用户显式设定。 + +3. 两种使用 drop_out 的方法有何区别 +------------------------------------ + +* 在PaddlePaddle中使用dropout有两种方式 + + * 在相应layer的 :code:`layer_atter` 设置 :code:`drop_rate`,以 :code:`paddle.layer.fc` 为例,代码如下: + + .. code-block:: python + + fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5)) + + * 使用 :code:`paddle.layer.dropout`,以 :code:`paddle.layer.fc` 为例,代码如下: + + .. code-block:: python + + fc = paddle.layer.fc(input=input) + drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5) + +* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`,并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。 + +* PaddlePaddle在激活函数里实现dropout,而不是在layer里实现。 + +* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活,所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout,可采用第二种方式,即使用 :code:`paddle.layer.dropout`。 + +4. 不同的 recurrent layer 的区别 +---------------------------------- +以LSTM为例,在PaddlePaddle中包含以下 recurrent layer: + +* :code:`paddle.layer.lstmemory` +* :code:`paddle.networks.simple_lstm` +* :code:`paddle.networks.lstmemory_group` +* :code:`paddle.networks.bidirectional_lstm` + +按照具体实现方式可以归纳为2类: + +1. 由 recurrent_group 实现的 recurrent layer: + + * 用户在使用这一类recurrent layer时,可以访问由recurrent unit在一个时间步内计算得到的中间值(例如:hidden states, memory cells等); + * 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer ; + +2. 将recurrent layer作为一个整体来实现: + + * 用户在使用这一类recurrent layer,只能访问它们的输出值; + * 上述的 :code:`paddle.networks.lstmemory_group` 、 :code:`paddle.networks.simple_lstm` 和 :code:`paddle.networks.bidirectional_lstm` 属于这一类的实现; + +将recurrent layer作为一个整体来实现, 能够针对CPU和GPU的计算做更多优化, 所以相比于recurrent group的实现方式, 第二类 recurrent layer 计算效率更高。 在实际应用中,如果用户不需要访问LSTM的中间变量,而只需要获得recurrent layer计算的输出,我们建议使用第二类实现。 + +此外,关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元: + + * 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程,它并不是一个完整的recurrent layer,也不能接收序列数据作为输入; + * :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用; + +5. PaddlePaddle的softmax能否指定计算的维度 +----------------------------------------- + +PaddlePaddle的softmax不能指定计算维度,只能按行计算。 +在图像任务中,对于NCHW,如果需要在C维度计算softmax,可以先使用 :code:`paddle.layer.switch_order` 改变维度顺序,即将NCHW转换成NHWC,再做一定的reshape,最后计算softmax。 + +6. PaddlePaddle是否支持维数可变的数据输入 +------------------------------------------ + +PaddlePaddle提供的 :code:`paddle.data_type.dense_array` 支持维数可变的数据输入。在使用时,将对应数据层的维数设置成一个大于输入数据维数的值用于占位即可。 diff --git a/doc/faq/parameter/index_cn.rst b/doc/faq/parameter/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..6fa0c64413be1616a435640b0347904a49873349 --- /dev/null +++ b/doc/faq/parameter/index_cn.rst @@ -0,0 +1,201 @@ +######### +参数设置 +######### + +.. contents:: + +1. 
如何选择SGD算法的学习率 +-------------------------- + +在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。 + +通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。 + +如果训练过程的的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。 + +2. 如何设置学习率退火(learning rate annealing) +------------------------------------------------ + +在相应的优化算法里设置learning_rate_schedule及相关参数,以使用Adam算法为例,代码如下: + +.. code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_decay_a=0.5, + learning_rate_decay_b=0.75, + learning_rate_schedule="poly",) + +PaddlePaddle目前支持8种learning_rate_schedule,这8种learning_rate_schedule及其对应学习率计算方式如下: + +* "constant" + + lr = learning_rate + +* "poly" + + lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b) + + 其中,num_samples_processed为已训练样本数,下同。 + +* "caffe_poly" + + lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b) + +* "exp" + + lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b) + +* "discexp" + + lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b)) + +* "linear" + + lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b) + +* "manual" + + 这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下: + + .. code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_schedule="manual", + learning_rate_args="1000:1.0,2000:0.9,3000:0.8",) + + 在该示例中,当已训练样本数小于等于1000时,学习率为 :code:`1e-3 * 1.0`;当已训练样本数大于1000小于等于2000时,学习率为 :code:`1e-3 * 0.9`;当已训练样本数大于2000时,学习率为 :code:`1e-3 * 0.8`。 + +* "pass_manual" + + 这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下: + + .. code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_schedule="pass_manual", + learning_rate_args="1:1.0,2:0.9,3:0.8",) + + 在该示例中,当已训练pass数小于等于1时,学习率为 :code:`1e-3 * 1.0`;当已训练pass数大于1小于等于2时,学习率为 :code:`1e-3 * 0.9`;当已训练pass数大于2时,学习率为 :code:`1e-3 * 0.8`。 + +3. 如何初始化参数 +----------------- + +默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\: + +* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` +* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` + +比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。 + +.. code-block:: python + + hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), + bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) + +上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。 + +4. 如何共享参数 +--------------- + +PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。 + +简单的全连接网络,参数共享的配置示例为\: + +.. 
literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py + +这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 + +5. 如何加载预训练参数 +------------------------ + +* 对加载预训练参数的层,设置其参数属性 :code:`is_static=True`,使该层的参数在训练过程中保持不变。以embedding层为例,代码如下: + +.. code-block:: python + + emb_para = paddle.attr.Param(name='emb', is_static=True) + paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para) + + +* 从模型文件将预训练参数载入 :code:`numpy.array`,在创建parameters后,使用 :code:`parameters.set()` 加载预训练参数。PaddlePaddle保存的模型参数文件前16字节为头信息,用户将参数载入 :code:`numpy.array` 时须从第17字节开始。以embedding层为例,代码如下: + +.. code-block:: python + + def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. + return np.fromfile(f, dtype=np.float32).reshape(h, w) + + parameters = paddle.parameters.create(my_cost) + parameters.set('emb', load_parameter(emb_param_file, 30000, 256)) + +6. 存储的参数格式是什么,如何和明文进行相互转化 +-------------------------------------------------- + +PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中,1~4字节表示PaddlePaddle版本信息,请直接填充0;5~8字节表示每个参数占用的字节数,当保存的网络参数为float类型时为4,double类型时为8;9~16字节表示保存的参数总个数。 + +将PaddlePaddle保存的模型参数还原回明文时,可以使用相应数据类型的 :code:`numpy.array` 加载具体网络参数,此时可以跳过PaddlePaddle模型参数文件的头信息。若在PaddlePaddle编译时,未指定按照double精度编译,默认情况下按照float精度计算,保存的参数也是float类型。这时在使用 :code:`numpy.array` 时,一般设置 :code:`dtype=float32` 。示例如下: + +.. code-block:: python + + def read_parameter(fname, width): + s = open(fname).read() + # skip header + vec = np.fromstring(s[16:], dtype=np.float32) + # width is the size of the corresponding layer + np.savetxt(fname + ".csv", vec.reshape(width, -1), + fmt="%.6f", delimiter=",") + + +将明文参数转化为PaddlePaddle可加载的模型参数时,首先构造头信息,再写入网络参数。下面的代码将随机生成的矩阵转化为可以被PaddlePaddle加载的模型参数。 + +.. code-block:: python + + def gen_rand_param(param_file, width, height, need_trans): + np.random.seed() + header = struct.pack("iil", 0, 4, height * width) + param = np.float32(np.random.rand(height, width)) + with open(param_file, "w") as fparam: + fparam.write(header + param.tostring()) + +7. A protocol message was rejected because it was too big +------------------------------------------------------------ + +如果在训练NLP相关模型时,出现以下错误: + +.. code-block:: bash + + [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h. + F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) + +可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似: + +.. code-block:: python + + src_dict = dict() + for line_count, line in enumerate(open(src_dict_path, "r")): + src_dict[line.strip()] = line_count + + define_py_data_sources2( + train_list, + test_list, + module="dataprovider", + obj="process", + args={"src_dict": src_dict}) + +解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为: + +.. 
code-block:: python + + define_py_data_sources2( + train_list, + test_list, + module="dataprovider", + obj="process", + args={"src_dict_path": src_dict_path}) + +完整源码可参考 `seqToseq `_ 示例。 + + diff --git a/doc/faq/src/reduce_min_pool_size.py b/doc/faq/src/reduce_min_pool_size.py deleted file mode 100644 index 5715397cc11e18246b8522fcc5b4f05780c9a0a7..0000000000000000000000000000000000000000 --- a/doc/faq/src/reduce_min_pool_size.py +++ /dev/null @@ -1,6 +0,0 @@ -@provider(min_pool_size=0, ...) -def process(settings, filename): - os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before. - with open('%s.shuf' % filename, 'r') as f: - for line in f: - yield get_sample_from_line(line) diff --git a/doc/faq/src/word2vec_config.py b/doc/faq/src/word2vec_config.py deleted file mode 100644 index 866b40c3d4c96c1213b3f716f29b14dd38763edb..0000000000000000000000000000000000000000 --- a/doc/faq/src/word2vec_config.py +++ /dev/null @@ -1,12 +0,0 @@ -... # the settings and define data provider is omitted. -DICT_DIM = 3000 # dictionary dimension. -word_ids = data_layer('word_ids', size=DICT_DIM) - -emb = embedding_layer( - input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True)) -emb_sum = pooling_layer(input=emb, pooling_type=SumPooling()) -predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax()) -outputs( - classification_cost( - input=predict, label=data_layer( - 'label', size=DICT_DIM))) diff --git a/doc/faq/src/word2vec_dataprovider.py b/doc/faq/src/word2vec_dataprovider.py deleted file mode 100644 index ec2753a7d01d7dd4d804c3bed0bac1be9c3fb3d3..0000000000000000000000000000000000000000 --- a/doc/faq/src/word2vec_dataprovider.py +++ /dev/null @@ -1,10 +0,0 @@ -DICT_DIM = 3000 - - -@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)]) -def process(settings, filename): - with open(filename) as f: - # yield word ids to predict inner word id - # such as [28, 29, 10, 4], 4 - # It means the sentance is 28, 29, 4, 10, 4. - yield read_next_from_file(f) diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst deleted file mode 100644 index 428f58830e0b10c024f31238b7404c6df193eecd..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_cn.rst +++ /dev/null @@ -1,108 +0,0 @@ -经典的线性回归任务 -================== - -PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。 - -任务简介 --------- - -我们展示如何用PaddlePaddle解决 `单变量的线性回归 `_ 问题。线性回归的输入是一批点 `(x, y)` ,其中 `y = wx + b + ε`, 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。 - -一个例子是房产估值。我们假设房产的价格(y)是其大小(x)的一个线性函数,那么我们可以通过收集市场上房子的大小和价格,用来估计线性函数的参数w 和 b。 - -准备数据 ------------ - -假设变量 `x` 和 `y` 的真实关系为: `y = 2x + 0.3 + ε`,这里展示如何使用观测数据来拟合这一线性关系。首先,Python代码将随机产生2000个观测点,作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。 - -.. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # 定义输入数据的类型: 2个浮点数 - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -训练模型 ------------ - -为了还原 `y = 2x + 0.3`,我们先从一条随机的直线 `y' = wx + b` 开始,然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小,最终趋于接近。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。 - -在PaddlePaddle里,该模型的网络配置如下。 - -.. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. 
定义数据来源,调用上面的process函数获得观测数据 - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. 学习算法。控制如何改变模型参数 w 和 b - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. 神经网络配置 - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - # 线性计算网络层: ȳ = wx + b - ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - # 计算误差函数,即 ȳ 和真实 y 之间的距离 - cost = mse_cost(input= ȳ, label=y) - outputs(cost) - - -这段简短的配置展示了PaddlePaddle的基本用法: - -- 第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的 `process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件,所以放一个空列表(`empty.list`)即可。 - -- 第二部分主要是选择学习算法,它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法,这里使用一个基于momentum的随机梯度下降(SGD)算法,该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。 - -- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层,所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元: - - - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。 - - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。 - - **回归误差代价层**:回归误差代价层 `mse_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 - -定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令: - -.. code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - -PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加误差代价函数的输出在不断的减小,这意味着模型在训练数据上不断的改进,直到逼近真实解:` y = 2x + 0.3 ` - -模型检验 ------------ - -训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测,评价预测的效果。在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。 - -PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。 - -.. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - -.. image:: ./parameters.png - :align: center - :scale: 80 % - -从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型一致。 - -这样,我们用PaddlePaddle解决了单变量线性回归问题, 包括数据输入、模型训练和最后的结果验证。 diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst deleted file mode 100644 index 6775da20c2f51000f305b095d40abd27b8fa6c0e..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_en.rst +++ /dev/null @@ -1,101 +0,0 @@ -Simple Linear Regression -======================== - -PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on. - -Problem Background ------------------- - -Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression `_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. 
For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets. - -Prepare the Data ------------------ - -Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types. - - .. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # define data types of input: 2 real numbers - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -Train a NeuralNetwork ----------------------- - -To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle: - - .. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. read data. Suppose you saved above python code as dataprovider.py - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. learning algorithm - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. Network configuration - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - cost = mse_cost(input=y_predict, label=y) - outputs(cost) - -Some of the most fundamental usages of PaddlePaddle are demonstrated: - -- The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly. - -- The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time. - -- Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration: - - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``. - - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model. - - **Cost Layer**: in training phase, cost layers are usually the last layers of the network. 
They measure the performance of current model, and provide guidence to adjust parameters. - -Now that everything is ready, you can train the network with a simple command line call: - - .. code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - - -This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess. - - -Evaluate the Model -------------------- - -Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly. - -In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass. - - .. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - - .. image:: parameters.png - :align: center - -Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer. - -There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data. diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png deleted file mode 100644 index 2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..71904dc41ed0d946867d890cc585e1b88450ca8c --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -0,0 +1,151 @@ +从源码编译 +====================== + +.. _build_step: + +编译方法 +---------------- + +PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译工具。 +我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 +可以在 `这里 `_ 找到。 + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 + +编译PaddlePaddle,需要执行: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制 + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # 如果不使用Docker编译环境,执行下面的命令 + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: + +.. code-block:: bash + + pip install build/python/dist/*.whl + +如果机器中已经安装过PaddlePaddle,有两种方法: + +.. code-block:: bash + + 1. 先卸载之前的版本,再重新安装 + pip uninstall paddlepaddle + pip install build/python/dist/*.whl + + 2. 直接升级到更新的版本 + pip install build/python/dist/*.whl -U + +.. 
_run_test: + +执行单元测试 +---------------- + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + +如果不使用Docker,可以执行ctest命令即可: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + ctest + # 指定执行其中一个单元测试 test_mul_op + ctest -R test_mul_op + +.. _compile_deps: + +编译依赖 +---------------- + +PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 + +.. csv-table:: PaddlePaddle编译依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "CMake", ">=3.2", "" + "GCC", "4.8.2", "推荐使用CentOS的devtools2" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "可选" + + +.. _build_options: + +编译选项 +---------------- + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 +用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 +`官方文档 `_ 。 + +在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: 编译选项说明 + :header: "选项", "说明", "默认值" + :widths: 1, 7, 2 + + "WITH_GPU", "是否支持GPU", "ON" + "WITH_C_API", "是否仅编译CAPI", "OFF" + "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" + "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" + "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" + "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" + "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" + "WITH_TESTING", "是否开启单元测试", "ON" + "WITH_DOC", "是否编译中英文文档", "OFF" + "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" + "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" + +BLAS ++++++ + +PaddlePaddle支持 `MKL `_ 和 +`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, +还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 +使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 +我们推荐使用最新版本的cuDNN。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。** diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md deleted file mode 100644 index 69f4501f370dcc9d603ec54a63d68568d66e832e..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ /dev/null @@ -1,222 +0,0 @@ -Installing from Sources -========================== - -* [1. Download and Setup](#download) -* [2. Requirements](#requirements) -* [3. Build on Ubuntu](#ubuntu) -* [4. Build on Centos](#centos) - - -## Download and Setup -You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle). - -```bash -git clone https://github.com/PaddlePaddle/Paddle paddle -cd paddle -``` -## Requirements - -To compile the source code, your computer must be equipped with the following dependencies. 
- -- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler -- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X) -- **BLAS**: MKL, OpenBlas or ATLAS -- **Python**: only support Python 2.7 - -**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! -For CUDA 8.0, GCC versions later than 5.3 are not supported! - -### Options - -PaddlePaddle supports some build options. - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Optional | Description |
| --- | --- |
| WITH_GPU | Compile PaddlePaddle with NVIDIA GPU |
| WITH_AVX | Compile PaddlePaddle with AVX intrinsics |
| WITH_DSO | Compile PaddlePaddle with dynamic linked CUDA |
| WITH_TESTING | Compile PaddlePaddle with unit testing |
| WITH_SWIG_PY | Compile PaddlePaddle with inference api |
| WITH_STYLE_CHECK | Compile PaddlePaddle with style check |
| WITH_PYTHON | Compile PaddlePaddle with python interpreter |
| WITH_DOUBLE | Compile PaddlePaddle with double precision |
| WITH_RDMA | Compile PaddlePaddle with RDMA support |
| WITH_TIMER | Compile PaddlePaddle with stats timer |
| WITH_PROFILER | Compile PaddlePaddle with GPU profiler |
| WITH_DOC | Compile PaddlePaddle with documentation |
| WITH_COVERAGE | Compile PaddlePaddle with code coverage |
| COVERALLS_UPLOAD | Package code coverage data to coveralls |
| ON_TRAVIS | Exclude special unit test on Travis CI |
- - -**Note:** - - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5. - - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported. - - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.** - -As a simple example, consider the following: - -1. **BLAS Dependencies(optional)** - - CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically. - To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`. - - ```bash - # specify MKL - cmake .. -DMKL_ROOT= - # or specify OpenBLAS - cmake .. -DOPENBLAS_ROOT= - ``` - -2. **Doc Dependencies(optional)** - - To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows: - - ```bash - pip install 'sphinx>=1.4.0' - pip install sphinx_rtd_theme recommonmark - - # install doxygen on Ubuntu - sudo apt-get install doxygen - # install doxygen on Mac OS X - brew install doxygen - - # active docs in cmake - cmake .. -DWITH_DOC=ON` - ``` - -## Build on Ubuntu 14.04 - -### Install Dependencies - -- **Paddle Dependencies** - - ```bash - # necessary - sudo apt-get update - sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake - sudo apt-get install -y python python-pip python-numpy libpython-dev bison - sudo pip install 'protobuf==3.1.0.post1' - - # install cmake 3.4 - curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \ - cd .. && rm -rf cmake-3.4.1 - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a gcc compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake .. -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. 
-sudo pip install /opt/paddle/share/wheels/*.whl -``` -## Build on Centos 7 - -### Install Dependencies - -- **CPU Dependencies** - - ```bash - # necessary - sudo yum update - sudo yum install -y epel-release - sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git - sudo pip install wheel numpy - sudo pip install 'protobuf>=3.0.0' - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a gcc compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake3 .. -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. -sudo pip install /opt/paddle/share/wheels/*.whl -``` diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..27f73b2e2c029b41d514e1612912ed1c335605b6 --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -0,0 +1,169 @@ +Build from Sources +========================== + +.. _build_step: + +How To Build +---------------- + +PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile +tools. We recommend you to use our pre-built Docker image to run the build +to avoid installing dependencies by yourself. We have several build environment +Docker images `here `_ . + +If you choose not to use Docker image for your build, you need to install the +below `Compile Dependencies`_ before run the build. + +Then run: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # run the following command to build a CPU-Only binaries if you are using docker + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # else run these commands + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +When the compile finishes, you can get the output whl package under +build/python/dist, then you can choose to install the whl on local +machine or copy it to the target machine. + +.. 
code-block:: bash + + pip install build/python/dist/*.whl + +If the machine has installed PaddlePaddle before, there are two methods: + +.. code-block:: bash + + 1. uninstall and reinstall + pip uninstall paddlepaddle + pip install build/python/dist/*.whl + + 2. upgrade directly + pip install build/python/dist/*.whl -U + +.. _run_test: + +Run Tests +---------------- + +If you wish to run the tests, you may follow the below steps: + +When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build. +Set :code:`WITH_GPU=ON` Can also run tests on GPU. + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh + +If you don't use Docker, just run ctest will start the tests: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=ON .. + make + ctest + # run a single test like test_mul_op + ctest -R test_mul_op + + +.. _compile_deps: + +Compile Dependencies +---------------- + +PaddlePaddle need the following dependencies when compiling, other dependencies +will be downloaded automatically. + +.. csv-table:: PaddlePaddle Compile Dependencies + :header: "Dependency", "Version", "Description" + :widths: 10, 15, 30 + + "CMake", ">=3.2", "" + "GCC", "4.8.2", "Recommend devtools2 for CentOS" + "Python", "2.7.x", "Need libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "Optional" + + +.. _build_options: + +Build Options +---------------- + +Build options include whether build binaries for CPU or GPU, which BLAS +library to use etc. You may pass these settings when running cmake. +For detailed cmake tutorial please refer to `here `_ 。 + +.. _build_options_bool: + +Bool Type Options +---------------- + +You can add :code:`-D` argument to pass such options, like: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: Bool Type Options + :header: "Option", "Description", "Default" + :widths: 1, 7, 2 + + "WITH_GPU", "Build with GPU support", "ON" + "WITH_C_API", "Build only CAPI", "OFF" + "WITH_DOUBLE", "Build with double precision", "OFF" + "WITH_DSO", "Dynamically load CUDA libraries", "ON" + "WITH_AVX", "Build with AVX support", "ON" + "WITH_PYTHON", "Build with integrated Python interpreter", "ON" + "WITH_STYLE_CHECK", "Check code style when building", "ON" + "WITH_TESTING", "Build unit tests", "ON" + "WITH_DOC", "Build documentations", "OFF" + "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" + "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" + "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" + + +BLAS ++++++ + +PaddlePaddle supports `MKL `_ and +`OpenBlAS `_ as BLAS library。By default it uses MKL. +If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded +and used, for more `details `_ . + +If you choose not to use MKL, then OpenBlAS will be used. + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle will automatically find CUDA and cuDNN when compiling and running. +parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture +automatically in order to speed up the build. + +PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to +keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN +you built. 
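As a hedged illustration of the paragraph above, a GPU build that lets cmake detect the SM architecture of the local card could be configured as follows; the cuDNN path is only an example location, not a requirement:

.. code-block:: bash

   # illustrative GPU configuration; /opt/cudnnv5 is an example cuDNN install path
   cmake .. -DWITH_GPU=ON -DCUDA_ARCH_NAME=Auto -DCUDNN_ROOT=/opt/cudnnv5
   make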
+ +Pass Compile Options +++++++++++++++ + +You can pass compile options to use intended BLAS/CUDA/Cudnn libraries. +When running cmake command, it will search system paths like +:code:`/usr/lib:/usr/local/lib` and then search paths that you +passed to cmake, i.e. + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.** diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png deleted file mode 100644 index a58cd09ad99cf27cc1ca5785fe54d726b83a82f6..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst deleted file mode 100644 index be0c1ffa451b2901ec06621dd4d886f800b4562e..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst +++ /dev/null @@ -1,43 +0,0 @@ -PaddlePaddle的编译选项 -====================== - -PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。 - -Bool型的编译选项 ----------------- -用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如 - -.. code-block:: bash - - cmake .. -DWITH_GPU=OFF - -.. csv-table:: Bool型的编译选项 - :widths: 1, 7, 2 - :file: compile_options.csv - -BLAS/CUDA/Cudnn的编译选项 --------------------------- -BLAS -+++++ - -PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBlAS `_ 和 `REFERENCE BLAS `_ 。 - -.. csv-table:: BLAS路径相关的编译选项 - :widths: 1, 2, 7 - :file: cblas_settings.csv - -CUDA/Cudnn -+++++++++++ - -PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。 - -编译选项的设置 -++++++++++++++ - -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 - -.. code-block:: bash - - cmake .. 
-DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5 - -注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。 diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv deleted file mode 100644 index a6356baf16a0d3d2499e39d2055d8ee878dcaef2..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv +++ /dev/null @@ -1,5 +0,0 @@ -编译选项,描述,注意 -MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。 -ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。 -OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。 -REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。 \ No newline at end of file diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv deleted file mode 100644 index 463b825470579d0c3736a408b1e82dd33e6f8d42..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/compile_options.csv +++ /dev/null @@ -1,12 +0,0 @@ -选项,说明,默认值 -WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链 -WITH_DOUBLE,是否使用双精度浮点数。,否 -WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是 -WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 -WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 -WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 -WITH_RDMA,是否开启RDMA,否 -WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 -WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST -WITH_DOC,是否编译中英文文档,否 -WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG \ No newline at end of file diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 87c286a1af75e08313813f1373ea03b85d4af523..79d214635a069a739060e0b79424729f6ff90387 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -1,183 +1,145 @@ -PaddlePaddle的Docker容器使用方式 +使用Docker安装运行 ================================ -PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 +使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 +您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。 +如果您在使用Windows,可以参考 +`这篇 `_ +教程,完成在Windows上安装和使用Docker。 -PaddlePaddle发布的Docker镜像使用说明 ------------------------------- - -我们把PaddlePaddle的编译环境打包成一个镜像,称为开发镜像,里面涵盖了 -PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜 -像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 -PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 -行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 -`dockerhub.com `_ 提供最新 -的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国 -内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您 -在国内,请把文档里命令中的paddlepaddle/paddle替换成 -docker.paddlepaddle.org/paddle。 - -1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev` - - 这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布, - 文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。 - 开发镜像包含了以下工具: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - 很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作, - 也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发: - - 以交互容器方式运行开发镜像: - - .. code-block:: bash - - docker run -it --rm paddlepaddle/paddle:0.10.0-dev /bin/bash - - 或者,可以以后台进程方式运行容器: - - .. 
code-block:: bash +在了解Docker的基本使用方法之后,即可开始下面的步骤: - docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0-dev +.. _docker_pull: - 然后用密码 :code:`root` SSH进入容器: - - .. code-block:: bash +获取PaddlePaddle的Docker镜像 +------------------------------ - ssh -p 2202 root@localhost +执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: - SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 + .. code-block:: bash -2. 生产镜像:根据CPU、GPU和非AVX区分了如下4个镜像: + docker pull paddlepaddle/paddle - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` +对于国内用户,我们提供了加速访问的镜像源: - 纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: + .. code-block:: bash - .. code-block:: bash + docker pull docker.paddlepaddlehub.com/paddle - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi +下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: - 如果输出是No,就需要选择使用no-AVX的镜像 + .. code-block:: bash - 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 - 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddlehub.com/paddle:latest-gpu - .. code-block:: bash +选择下载使用不同的BLAS库的Docker镜像: - nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash + .. code-block:: bash - 注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。: + # 默认是使用MKL的镜像 + docker pull paddlepaddle/paddle + # 使用OpenBLAS的镜像 + docker pull paddlepaddle/paddle:latest-openblas - .. code-block:: bash +下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu + .. code-block:: bash -3. 运行以及发布您的AI程序 + docker pull paddlepaddle/paddle:[tag] + # 比如: + docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu - 假设您已经完成了一个AI训练的python程序 :code:`a.py`,这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行: +.. _docker_run: - .. code-block:: bash +在Docker中执行PaddlePaddle训练程序 +---------------------------------- - docker run -it -v $PWD:/work paddle /work/a.py +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 +`PaddlePaddleBook `_ +编写),就可以使用下面的命令开始执行训练: - 如果要使用GPU,请运行: + .. code-block:: bash - .. code-block:: bash + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` +指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` +目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` +为容器内执行的命令,即运行训练程序。 - nvidia-docker run -it -v $PWD:/work paddle /work/a.py +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py - 这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0` - 创建和发布自己的AI程序镜像。 +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** -运行PaddlePaddle Book ---------------------- +.. 
_docker_run_book: -Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 +使用Docker启动PaddlePaddle Book教程 +----------------------------------- -PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Nodebook。 +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 +PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: -.. code-block:: bash - - docker run -p 8888:8888 paddlepaddle/book + .. code-block:: bash -然后在浏览器中输入以下网址: + docker run -p 8888:8888 paddlepaddle/book -.. code-block:: text +国内用户可以使用下面的镜像源来加速访问: - http://localhost:8888/ + .. code-block: bash -就这么简单,享受您的旅程! + docker run -p 8888:8888 docker.paddlepaddlehub.com/book -通过Docker容器开发PaddlePaddle ------------------------------- +然后在浏览器中输入以下网址: -开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 + .. code-block:: text -1. 制作PaddlePaddle开发镜像 + http://localhost:8888/ - PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。 - 生成Docker镜像的方式有两个,一个是直接把一个容器转换成镜像,另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷,适合自己实验,可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚,其他人很容易看懂镜像生成过程,持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行: +就这么简单,享受您的旅程! - .. code-block:: bash - - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build -t paddle:dev . +.. _docker_run_gpu: - docker build这个命令的-t指定了生成的镜像的名字,这里我们用paddle:dev。到此,PaddlePaddle开发镜像就被构建完毕了。 +使用Docker执行GPU训练 +------------------------------ -2. 制作PaddlePaddle生产镜像 +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +`nvidia-docker `_ 来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 - 生产镜像的生成分为两步,第一步是运行: + .. code-block:: bash - .. code-block:: bash - - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash - 以上命令会编译PaddlePaddle,生成运行程序,以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU,“WITH_AVX”控制生成的生产镜像是否支持AVX,”WITH_TEST“控制是否生成单元测试。 +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** - 第二步是运行: + .. code-block:: bash - .. code-block:: bash - - docker build -t paddle:prod -f build/Dockerfile ./build + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu - 以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置,最终生成名为paddle:prod的生产镜像。 +**关于AVX:** -3. 运行单元测试 +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 +`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。 - 运行以下指令: +以下指令能检查Linux电脑是否支持AVX: .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" - -文档 ----- -Paddle的Docker开发镜像带有一个通过 `woboq code browser -`_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。 - -只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码: - -.. 
code-block:: bash - - docker run -d --name paddle-cpu-doc paddle:0.10.0-dev - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi -接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。 +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index b6fd3329b273aabe80edd5f1ff064a311648b3c2..e0e0559fb858a093db96a9b4ec1c5a45d6c71a38 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -1,270 +1,152 @@ -PaddlePaddle in Docker Containers +Run in Docker Containers ================================= -Docker container is currently the only officially-supported way to -running PaddlePaddle. This is reasonable as Docker now runs on all -major operating systems including Linux, Mac OS X, and Windows. -Please be aware that you will need to change `Dockers settings -`_ to make full use -of your hardware resource on Mac OS X and Windows. +Run PaddlePaddle in Docker container so that you don't need to care about +runtime dependencies, also you can run under Windows system. You can get +tutorials at `here `_ . -Working With Docker -------------------- +If you are using Windows, please refer to +`this `_ +tutorial to start running docker under windows. -Docker is simple as long as we understand a few basic concepts: +After you've read above tutorials you may proceed the following steps. -- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type +.. _docker_pull: - .. code-block:: bash - - docker images +Pull PaddlePaddle Docker Image +------------------------------ - to list all images in the system. We can also run +Run the following command to download the latest Docker images, the version is cpu_avx_mkl: .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0 - to download a Docker image, paddlepaddle/paddle in this example, - from Dockerhub.com. + docker pull paddlepaddle/paddle -- *container*: considering a Docker image a program, a container is a - "process" that runs the image. Indeed, a container is exactly an - operating system process, but with a virtualized filesystem, network - port space, and other virtualized environment. We can type +For users in China, we provide a faster mirror: .. code-block:: bash - docker run paddlepaddle/paddle:0.10.0 - - to start a container to run a Docker image, paddlepaddle/paddle in this example. + docker pull docker.paddlepaddlehub.com/paddle -- By default docker container have an isolated file system namespace, - we can not see the files in the host file system. By using *volume*, - mounted files in host will be visible inside docker container. - Following command will mount current dirctory into /data inside - docker container, run docker container from debian image with - command :code:`ls /data`. +Download GPU version (cuda8.0_cudnn5_avx_mkl) images: .. code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -Usage of CPU-only and GPU Images ----------------------------------- - -We package PaddlePaddle's compile environment into a Docker image, -called the develop image, it contains all compiling tools that -PaddlePaddle needs. 
We package compiled PaddlePaddle program into a -Docker image as well, called the production image, it contains all -runtime environment that running PaddlePaddle needs. For each version -of PaddlePaddle, we release both of them. Production image includes -CPU-only version and a CUDA GPU version and their no-AVX versions. - -We put the docker images on `dockerhub.com -`_. You can find the -latest versions under "tags" tab at dockerhub.com. If you are in -China, you can use our Docker image registry mirror to speed up the -download process. To use it, please replace all paddlepaddle/paddle in -the commands to docker.paddlepaddle.org/paddle. - -1. Production images, this image might have multiple variants: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - Please be aware that the CPU-only and the GPU images both use the - AVX instruction set, but old computers produced before 2008 do not - support AVX. The following command checks if your Linux computer - supports AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - - To run the CPU-only image as an interactive container: - - .. code-block:: bash - - docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddlehub.com/paddle:latest-gpu - Above method work with the GPU image too -- the recommended way is - using `nvidia-docker `_. +Choose between different BLAS version: - Please install nvidia-docker first following this `tutorial - `_. - - Now you can run a GPU image: - - .. code-block:: bash - - nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash - -2. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. - - -Train Model Using Python API ----------------------------- + .. code-block:: bash -Our official docker image provides a runtime for PaddlePaddle -programs. The typical workflow will be as follows: + # image using MKL by default + docker pull paddlepaddle/paddle + # image using OpenBLAS + docker pull paddlepaddle/paddle:latest-openblas -Create a directory as workspace: -.. code-block:: bash +If you want to use legacy versions, choose a tag from +`DockerHub `_ +and run: - mkdir ~/workspace + .. code-block:: bash -Edit a PaddlePaddle python program using your favourite editor + docker pull paddlepaddle/paddle:[tag] + # i.e. + docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu -.. code-block:: bash +.. 
_docker_run: - emacs ~/workspace/example.py +Launch your training program in Docker +-------------------------------------- -Run the program using docker: +Assume that you have already written a PaddlePaddle program +named :code:`train.py` under directory :code:`/home/work` (refer to +`PaddlePaddleBook `_ +for more samples), then run the following command: -.. code-block:: bash + .. code-block:: bash - docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py -Or if you are using GPU for training: +In the above command, :code:`-it` means run the container interactively; +:code:`-v $PWD:/work` means mount the current directory ($PWD will expand +to current absolute path in Linux) under :code:`/work` in the container. +:code:`paddlepaddle/paddle` to specify image to use; finnally +:code:`/work/train.py` is the command to run inside docker. -.. code-block:: bash +Also, you can go into the container shell, run or debug your code +interactively: - nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py -Above commands will start a docker container by running :code:`python -/workspace/example.py`. It will stop once :code:`python -/workspace/example.py` finishes. +**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.** -Another way is to tell docker to start a :code:`/bin/bash` session and -run PaddlePaddle program interactively: +.. _docker_run_book: -.. code-block:: bash +PaddlePaddle Book +------------------ - docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash - # now we are inside docker container - cd /workspace - python example.py +You can create a container serving PaddlePaddle Book using Jupyter Notebook in +one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook +for users and developers.If you want to +dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. -Running with GPU is identical: +We provide a packaged book image, simply issue the command: -.. code-block:: bash + .. code-block:: bash - nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash - # now we are inside docker container - cd /workspace - python example.py + docker run -p 8888:8888 paddlepaddle/book +For users in China, we provide a faster mirror: -Develop PaddlePaddle or Train Model Using C++ API ---------------------------------------------------- + .. code-block: bash -We will be using PaddlePaddle development image since it contains all -compiling tools and dependencies. + docker run -p 8888:8888 docker.paddlepaddlehub.com/book -1. Build PaddlePaddle develop image +Then, you would back and paste the address into the local browser: - Use following command to build PaddlePaddle develop image: + .. code-block:: text - .. code-block:: bash + http://localhost:8888/ - git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle - docker build -t paddle:dev . +That's all. Enjoy your journey! -2. Build PaddlePaddle production image +.. _docker_run_gpu: - There are two steps for building production image, the first step is to run: +Train with Docker with GPU +------------------------------ - .. 
code-block:: bash +We recommend using +`nvidia-docker `_ +to run GPU training jobs. Please ensure you have latest +GPU driver installed before move on. - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev + .. code-block:: bash - The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash - The second step is to run: +**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.** - .. code-block:: bash + .. code-block:: bash - docker build -t paddle:prod -f build/Dockerfile ./build + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu - The above command will generate the production image by copying the compiled PaddlePaddle program into the image. +**About AVX:** -3. Run unit test +AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations. +The latest PaddlePaddle Docker image turns AVX on by default, so, if your +computer doesn't support AVX, you'll probably need to +`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`. - Following command will run unit test: +The following command will tell you whether your computer supports AVX. .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" - -PaddlePaddle Book ------------------- - -The Jupyter Notebook is an open-source web application that allows -you to create and share documents that contain live code, equations, -visualizations and explanatory text in a single browser. - -PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. -We already exposed port 8888 for this book. If you want to -dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. - -We provide a packaged book image, simply issue the command: -.. code-block:: bash - - docker run -p 8888:8888 paddlepaddle/book - -Then, you would back and paste the address into the local browser: - -.. code-block:: text - - http://localhost:8888/ - -That's all. Enjoy your journey! - - -Documentation -------------- - -Paddle Docker images include an HTML version of C++ source code -generated using `woboq code browser -`_. This makes it easy -for users to browse and understand the C++ source code. - -As long as we give the Paddle Docker container a name, we can run an -additional Nginx Docker container to serve the volume from the Paddle -container: - -.. 
code-block:: bash - - docker run -d --name paddle-cpu-doc paddle: - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx - - -Then we can direct our Web browser to the HTML version of source code -at http://localhost:8088/paddle/ + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index a24df6c518fad84a48061ecb34ee46cb312a4995..c9ba84c842b530162c92713046e64fdf82bd441b 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -6,24 +6,28 @@ 安装流程 ++++++++ -PaddlePaddle提供数个预编译的二进制来进行安装,包括Docker镜像,ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境,同时欢迎贡献更多的安装包。 +PaddlePaddle提供pip和Docker的安装方式: .. toctree:: :maxdepth: 1 - - docker_install_cn.rst - ubuntu_install_cn.rst - + pip_install_cn.rst + docker_install_cn.rst + ../../howto/dev/build_cn.md 编译流程 ++++++++ .. warning:: - 编译流程主要推荐高级用户查看,普通用户请走安装流程。 + 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。 .. toctree:: :maxdepth: 1 - cmake/build_from_source_cn.rst + build_from_source_cn.rst + +常见问题解答 +++++++++++ + +`常见问题解答 `_ diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst index 1bfd4f75c0b9b82d61d28a30f03181f7be159f24..32d66d63dd5b2a30d5de4a088dc80b680830cb84 100644 --- a/doc/getstarted/build_and_install/index_en.rst +++ b/doc/getstarted/build_and_install/index_en.rst @@ -1,23 +1,34 @@ Install and Build ================= -Install PaddlePaddle ----------------------- +.. _install_steps: -.. toctree:: - :maxdepth: 1 +Install Steps +++++++++ + +You can choose either pip or Docker to complete your install: + +.. toctree:: + :maxdepth: 1 + + pip_install_en.rst + docker_install_en.rst + ../../howto/dev/build_en.md - docker_install_en.rst - ubuntu_install_en.rst Build from Source ----------------- .. warning:: - Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. + We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary. .. toctree:: :maxdepth: 1 build_from_source_en.md + +FAQ +++++++++++ + +`FAQ `_ diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..8e4165da6b8135d083766c650f1092158f9d01c2 --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -0,0 +1,87 @@ +使用pip安装 +================================ + +PaddlePaddle可以使用常用的Python包管理工具 +`pip `_ +完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。 + +.. _pip_install: + +使用pip安装 +------------------------------ + + +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件,版本为cpu_avx_openblas。 + + .. code-block:: bash + + pip install paddlepaddle + + +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装, +您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. 
csv-table:: 各个版本最新的whl包 + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +运行环境依赖 +------------------------------ + +PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。 + +PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。 + +.. csv-table:: PaddlePaddle环境依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上" + "Python", "2.7.x", "暂时不支持Python3" + "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号" + "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号" + +.. _pip_faq: + +安装常见问题和解决方法 +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip: + + .. code-block:: bash + + pip install --upgrade pip + + 如果仍然存在问题,可以执行: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + 获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。 + + 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。 diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..c1e806c0fe5f03139c0dff985f9ae0856eaa2e98 --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -0,0 +1,105 @@ +Install Using pip +================================ + +You can use current widely used Python package management +tool `pip `_ +to install PaddlePaddle. This method can be used in +most of current Linux systems or MacOS. + +.. _pip_install: + +Install Using pip +------------------------------ + +Run the following command to install PaddlePaddle on the current +machine, it will also download requirements, the version is cpu_avx_openblas. + + .. code-block:: bash + + pip install paddlepaddle + + +If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +If you wish to install the latest develop branch PaddlePaddle, +you can download the latest whl package from our CI system. 
Access +the below links, log in as guest, then click at the "Artifact" +tab, you'll find the download link of whl packages. + +If the links below shows up the login form, just click "Log in as guest" to start the download: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: whl package of each version + :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +Runtime Dependency +------------------------------ + +PaddlePaddle installation packages (whl) does not only contain .py files, +but also binaries built from C++ code. We ensure that PaddlePaddle can +run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04 +and MacOS 10.12. + +PaddlePaddle whl packages are trying to satisfy +`manylinux1 `_ +standard, which uses CentOS 5 as default build environment. But CUDA libraries +seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime, +so we use CentOS 6 as default build environment. + +.. csv-table:: PaddlePaddle Runtime Deps + :header: "Dependency", "version", "description" + :widths: 10, 15, 30 + + "OS", "Linux, MacOS", "CentOS 6 or later,Ubuntu 14.04 or later,MacOS 10.12 or later" + "Python", "2.7.x", "Currently Python3 is not supported" + "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols" + "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols" + +.. _pip_faq: + +FAQ +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + The main cause of this issue is that your current platform is + not supported. Please check that you are using Python 2.7 series. + Besides, pypi only supports manylinux1 standard, you'll need to + upgrade your pip to >9.0.0. Then run the below command: + + .. code-block:: bash + + pip install --upgrade pip + + If the problem still exists, run the following command: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + Then you'll get supported package suffixes, then check if it matches + the file name of the whl package. You can find default whl package at + `here `_ + + If your system supports linux_x86_64 but the whl package is manylinux1_x86_64, + you'll need to update pip to the latest version; If your system supports + manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the + file to manylinux1_x86_64 suffix and then install. 
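A concrete sketch of the renaming workaround described in the FAQ above, using one of the wheel names from the table as an illustrative example (adjust the file name to the package you actually downloaded):

.. code-block:: bash

   # check which wheel tags the local pip/platform accepts
   python -c "import pip; print(pip.pep425tags.get_supported())"

   # if the platform reports manylinux1_x86_64 but the downloaded wheel is tagged
   # linux_x86_64, renaming the file lets pip accept it (file name is illustrative)
   mv paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl \
      paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
   pip install paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl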
diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst deleted file mode 100644 index 9e39ccb00f5d5655c30148900a3d76a22aacfc01..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst +++ /dev/null @@ -1,71 +0,0 @@ -Ubuntu部署PaddlePaddle -=================================== - -PaddlePaddle提供了ubuntu 14.04 deb安装包。 - -安装 ------- - -安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases - -它包含四个版本\: - -* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。 - -* cpu-noavx版本:支持主流x86处理器平台,没有使用avx指令集。 - -* gpu版本:支持主流x86处理器平台,支持nvidia cuda平台,使用了avx指令集。 - -* gpu-noavx版本:支持主流x86处理器平台,支持nvidia cuda平台,没有使用avx指令集。 - -下载完相关安装包后,执行: - -.. code-block:: shell - - sudo apt-get install gdebi - gdebi paddle-*-cpu.deb - -或者: - -.. code-block:: shell - - dpkg -i paddle-*-cpu.deb - apt-get install -f - - -在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的, -在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。 - -安装完成后,可以使用命令 :code:`paddle version` 查看安装后的paddle 版本: - -.. code-block:: shell - - PaddlePaddle 0.8.0b1, compiled with - with_avx: ON - with_gpu: OFF - with_double: OFF - with_python: ON - with_rdma: OFF - with_timer: OFF - with_predict_sdk: - - -可能遇到的问题 --------------- - -libcudart.so/libcudnn.so找不到 -++++++++++++++++++++++++++++++ - -安装完成后,运行 :code:`paddle train` 报错\: - -.. code-block:: shell - - 0831 12:36:04.151525 1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH. - -原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle,请安装CUDA 7.5 和CUDNN 5到本地环境中,并设置: - -.. code-block:: shell - - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - diff --git a/doc/getstarted/build_and_install/ubuntu_install_en.rst b/doc/getstarted/build_and_install/ubuntu_install_en.rst deleted file mode 100644 index ea8042085bf458be96e71017d229d88ad867695b..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/ubuntu_install_en.rst +++ /dev/null @@ -1,25 +0,0 @@ -Debian Package installation guide -================================= - -PaddlePaddle supports :code:`deb` pacakge. The installation of this :code:`deb` package is tested in ubuntu 14.04, but it should be support other debian based linux, too. - -There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/ - - -After downloading PaddlePaddle deb packages, you can use :code:`gdebi` install. - -.. code-block:: bash - - gdebi paddle-*.deb - -If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it. - -Or you can use following commands to install PaddlePaddle. - -.. code-block:: bash - - dpkg -i paddle-*.deb - apt-get install -f - -And if you use GPU version deb package, you need to install CUDA toolkit and cuDNN, and set related environment variables(such as LD_LIBRARY_PATH) first. It is normal when `dpkg -i` get errors. `apt-get install -f` will continue install paddle, and install dependences. 
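For the GPU flavour of the deb package mentioned above, the environment variables referred to in the last paragraph are typically set as follows; the paths assume a default CUDA installation under /usr/local/cuda:

.. code-block:: bash

   # make the CUDA runtime and tools visible before running the GPU package
   export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
   export PATH=/usr/local/cuda/bin:$PATH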
- diff --git a/doc/getstarted/concepts/src/infer.py b/doc/getstarted/concepts/src/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..a1b60388c45ae83cbed3b5597e8b8aef5d69f814 --- /dev/null +++ b/doc/getstarted/concepts/src/infer.py @@ -0,0 +1,32 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2 as paddle +import numpy as np + +paddle.init(use_gpu=False) +x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2)) +y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + +# loading the model which generated by training +with open('params_pass_90.tar', 'r') as f: + parameters = paddle.parameters.Parameters.from_tar(f) + +# Input multiple sets of data,Output the infer result in a array. +i = [[[1, 2]], [[3, 4]], [[5, 6]]] +print paddle.infer(output_layer=y_predict, parameters=parameters, input=i) +# Will print: +# [[ -3.24491572] +# [ -6.94668722] +# [-10.64845848]] diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py index 679d0a931a7d650108ea89a04080a55d2976f72e..0e5bdb57bc95c513eb67d426741c860a34d37dd0 100644 --- a/doc/getstarted/concepts/src/train.py +++ b/doc/getstarted/concepts/src/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import paddle.v2 as paddle import numpy as np @@ -8,7 +22,7 @@ paddle.init(use_gpu=False) x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2)) y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) -cost = paddle.layer.mse_cost(input=y_predict, label=y) +cost = paddle.layer.square_error_cost(input=y_predict, label=y) # create parameters parameters = paddle.parameters.create(cost) @@ -26,12 +40,17 @@ def event_handler(event): if event.batch_id % 1 == 0: print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, event.cost) + # product model every 10 pass + if isinstance(event, paddle.event.EndPass): + if event.pass_id % 10 == 0: + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) # define training dataset reader def train_reader(): train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) - train_y = np.array([-2, -3, -7, -7]) + train_y = np.array([[-2], [-3], [-7], [-7]]) def reader(): for i in xrange(train_y.shape[0]): diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst index e63ca11102c8ce457afcc3c262fa5f159361c01d..e695ff283e2e806377a51c559b37e8068360a4ff 100644 --- a/doc/getstarted/concepts/use_concepts_cn.rst +++ b/doc/getstarted/concepts/use_concepts_cn.rst @@ -81,9 +81,9 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 .. code-block:: bash y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) - cost = paddle.layer.mse_cost(input=y_predict, label=y) + cost = paddle.layer.square_error_cost(input=y_predict, label=y) -其中,x与y为之前描述的输入层;而y_predict是接收x作为输入,接上一个全连接层;cost接收y_predict与y作为输入,接上均方误差层。 +其中,x与y为之前描述的输入层;而y_predict是接收x作为输入,接上一个全连接层;cost接收y_predict与y作为输入,接上平方误差层。 最后一层cost中记录了神经网络的所有拓扑结构,通过组合不同的layer,我们即可完成神经网络的搭建。 @@ -111,7 +111,7 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 # define training dataset reader def train_reader(): train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) - train_y = np.array([-2, -3, -7, -7]) + train_y = np.array([[-2], [-3], [-7], [-7]]) def reader(): for i in xrange(train_y.shape[0]): yield train_x[i], train_y[i] @@ -147,4 +147,9 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 .. literalinclude:: src/train.py :linenos: -有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 \ No newline at end of file +使用以上训练好的模型进行预测,取其中一个模型params_pass_90.tar,输入需要预测的向量组,然后打印输出: + +.. literalinclude:: src/infer.py + :linenos: + +有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index aa418c657a4ba16cce61c030066f4d3e14e891cc..9f6ee25987d51dcca3a37cf0f62a70a5a5a2d89a 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -1,10 +1,61 @@ 新手入门 ============ +.. _quick_install: + +快速安装 +++++++++ + +PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 +执行下面的命令完成快速安装,版本为cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +更详细的安装和编译方法参考: + .. toctree:: :maxdepth: 1 build_and_install/index_cn.rst - concepts/use_concepts_cn.rst -- `深度学习入门课程 `_ +.. _quick_start: + +快速开始 +++++++++ + +创建一个 housing.py 并粘贴此Python代码: + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. 
+ x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 + +.. toctree:: + :maxdepth: 1 + + concepts/use_concepts_cn.rst diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index be3253e3d41b99a2b696e2c5ef6463ed49680d69..063d9d880c82550f7f5d47d3d0b1fff59865bca7 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -1,9 +1,61 @@ GET STARTED ============ +.. _quick_install: + +Quick Install +---------------------- + +You can use pip to install PaddlePaddle with a single command, supports +CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. +Simply run the following command to install, the version is cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +For more details about installation and build: + .. toctree:: :maxdepth: 1 build_and_install/index_en.rst -- `Deep Learning 101 `_ + +.. _quick_start: + +Quick Start +++++++++ + +Create a new file called housing.py, and paste this Python +code: + + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +Run :code:`python housing.py` and voila! It should print out a list of predictions +for the test housing data. 
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md deleted file mode 100644 index 90dc84718c9ce1374cda6022de177afeeb60279d..0000000000000000000000000000000000000000 --- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md +++ /dev/null @@ -1,75 +0,0 @@ -# 构建Android平台上的PaddlePaddle库 - -用户可通过交叉编译的方式,在用户熟悉的开发平台(Linux,Mac OS X和Windows)上编译Android平台上适用的PaddlePaddle库。 -本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 - -## 准备交叉编译环境 - -从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn),用户可自行前往下载预编译好的版本,也可通过以下命令获取: - -```bash -wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip -unzip -q android-ndk-r14b-linux-x86_64.zip -``` - -Android NDK中包含了所有Android API级别、所有架构(arm/arm64/x86/mips)需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别,构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。 -比如: - -```bash -your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ - --arch=arm --platform=android-21 --install-dir=your/path/to/my_standalone_toolchain -``` - -此命令将在your/path/to/my_standalone_toolchain目录生成一套编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,使用的编译器为arm-linux-androideabi-gcc (GCC) 4.9。 - -注意:**PaddlePaddle要求使用的编译工具链所支持的Andoid API级别不小于21**。 - -## 配置交叉编译参数 - -CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。 - -交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数: -- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 -- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。 -- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。 - -Android平台可选配置参数: - -- `ANDROID_STANDALONE_TOOLCHAIN`,独立工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别;否则,用户需要在cmake时手动设置这些值。无默认值。 -- `ANDROID_ABI`,目标架构ABI。目前只支持`armeabi-v7a`,默认值为`armeabi-v7a`。 -- `ANDROID_NATIVE_API_LEVEL`,工具链的Android API级别。若没有显式设置,PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。 -- `ANROID_ARM_MODE`,是否使用ARM模式。可设置`ON/OFF`,默认值为`ON`。 -- `ANDROID_ARM_NEON`,是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 - -其他配置参数: - -- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 - -一种常用的cmake配置如下: - -```bash -cmake -DCMAKE_SYSTEM_NAME=Android \ - -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/my_standalone_toolchain \ - -DANDROID_ABI=armeabi-v7a \ - -DANDROID_ARM_NEON=ON \ - -DANDROID_ARM_MODE=ON \ - -DCMAKE_INSTALL_PREFIX=your/path/to/install \ - -DWITH_C_API=ON \ - -DWITH_SWIG_PY=OFF \ - .. 
-``` - -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 - -## 编译和安装 - -CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 - -```bash -make -make install -``` - -注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 - -执行完安装命令后,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Android版本的库。自此,PaddlePaddle的已经安装完成,用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中,调用方法见C-API文档。 diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md deleted file mode 100644 index 085b5dda1615a9af918b59870db460fcc5acdcca..0000000000000000000000000000000000000000 --- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md +++ /dev/null @@ -1,65 +0,0 @@ -# 构建Raspberry Pi平台上的PaddlePaddle库 - -对于Rasspberry Pi系统,用户可通过ssh等方式登录到Raspberry Pi系统上,按照[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档所述,直接编译Raspberry Pi平台上适用的PaddlePaddle库。 - -用户也可以在自己熟悉的开发平台上,通过交叉编译的方式来编译。这篇文档将以Linux x86-64平台为例,介绍交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。 - -## 准备交叉编译环境 - -从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。用户可自行前往[github](https://github.com/raspberrypi/tools)下载Raspberry Pi平台使用的C/C++交叉编译工具链,也可通过以下命令获取: - -```bash -git clone https://github.com/raspberrypi/tools.git -``` - -该github仓库中包含若干个预编译好的、针对不同平台的编译工具。宿主机是Linux x86-64环境,则需选用`arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`下的作为编译工具,所使用的编译器为arm-linux-gnueabihf-gcc 4.8.3。 - -注意,该编译工具链需要系统glibc支持2.14以上。 - -## 配置交叉编译参数 - -CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake),以提供一些默认的编译器和编译参数相关配置。 - -交叉编译Raspberry Pi版本PaddlePaddle库时,有一些必须配置的参数: - -- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后,PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。 - -Raspberry Pi平台可选配置参数: - -- `RPI_TOOLCHAIN`,编译工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器;否则,用户需要在cmake时手动设置这些值。无默认值。 -- `RPI_ARM_NEON`,是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 - -其他配置参数: - -- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 - -cmake参数如下; - -``` -cmake -DCMAKE_SYSTEM_NAME=RPi \ - -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ - -DRPI_ARM_NEON=ON \ - -DCMAKE_INSTALL_PREFIX=your/path/to/install \ - -DWITH_GPU=OFF \ - -DWITH_C_API=ON \ - -DWITH_PYTHON=OFF \ - -DWITH_SWIG_PY=OFF \ - .. 
-``` - -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 - -## 编译和安装 - -CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle。 - -```bash -make -make install -``` - -注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 - -执行完安装命令后,由于上一步cmake配置中`WITH_C_API`设置为`ON`,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Raspberry Pi版本的库。 - -更多的编译配置见[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档。 diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst index 9e805ca85191b793c8798a239927a318c70b96f5..9ecab5594cff47cde4700b7ce0f58013a960a16e 100644 --- a/doc/howto/deep_model/rnn/index_cn.rst +++ b/doc/howto/deep_model/rnn/index_cn.rst @@ -4,6 +4,7 @@ RNN相关模型 .. toctree:: :maxdepth: 1 + rnn_config_cn.rst recurrent_group_cn.md hierarchical_layer_cn.rst hrnn_rnn_api_compare_cn.rst diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst index 13a153b05c578e0af82ee29db5ea27fd4b6d6f59..7adc79873d699fdfd5a85034bcef964dd1f19132 100644 --- a/doc/howto/deep_model/rnn/index_en.rst +++ b/doc/howto/deep_model/rnn/index_en.rst @@ -1,2 +1,7 @@ RNN Models ========== + +.. toctree:: + :maxdepth: 1 + + rnn_config_en.rst diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst index ac2bd0775f4ab2e0a0c37462e2c23001123b152b..63fa161fafed0f3a8ec8799af21304cbec62d813 100644 --- a/doc/howto/deep_model/rnn/rnn_config_cn.rst +++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst @@ -5,36 +5,13 @@ RNN配置 中配置循环神经网络(RNN)。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中,您将了解如何: -- 准备用来学习循环神经网络的序列数据。 - 配置循环神经网络架构。 - 使用学习完成的循环神经网络模型生成序列。 我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence -模型的代码可以在\ ``demo / seqToseq``\ 找到。 - -准备序列数据 ------------- - -PaddlePaddle -不需要对序列数据进行任何预处理,例如填充。唯一需要做的是将相应类型设置为输入。例如,以下代码段定义了三个输入。 -它们都是序列,它们的大小是\ ``src_dict``\ ,\ ``trg_dict``\ 和\ ``trg_dict``\ : - -.. code:: python - - settings.input_types = [ - integer_value_sequence(len(settings.src_dict)), - integer_value_sequence(len(settings.trg_dict)), - integer_value_sequence(len(settings.trg_dict))] - -在\ ``process``\ 函数中,每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列: - -.. code:: python - - yield src_ids, trg_ids, trg_ids_next - -有关如何编写数据提供程序的更多细节描述,请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在 -``demo/seqToseq/dataprovider.py``\ 。 +模型的代码可以在 `book/08.machine_translation `_ 找到。 +wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py `_ 。 配置循环神经网络架构 -------------------- @@ -44,7 +21,7 @@ PaddlePaddle 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。 -.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg +.. 
image:: src/bi_lstm.jpg :align: center 一般来说,循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。 @@ -85,19 +62,19 @@ vanilla act=None, rnn_layer_attr=None): def __rnn_step__(ipt): - out_mem = memory(name=name, size=size) - rnn_out = mixed_layer(input = [full_matrix_projection(ipt), - full_matrix_projection(out_mem)], - name = name, - bias_attr = rnn_bias_attr, - act = act, - layer_attr = rnn_layer_attr, - size = size) + out_mem = paddle.layer.memory(name=name, size=size) + rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt), + paddle.layer.full_matrix_projection(input=out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) return rnn_out - return recurrent_group(name='%s_recurrent_group' % name, - step=__rnn_step__, - reverse=reverse, - input=input) + return paddle.layer.recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) PaddlePaddle 使用“Memory”(记忆模块)实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。 @@ -119,7 +96,7 @@ Sequence to Sequence Model with Attention 我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。 -.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png +.. image:: src/encoder-decoder-attention-model.png :align: center 在这个模型中,源序列 :math:`S = \{s_1, \dots, s_T\}` @@ -140,43 +117,52 @@ Sequence to Sequence Model with Attention .. code:: python # 定义源语句的数据层 - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) # 计算每个词的词向量 - src_embedding = embedding_layer( + src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) # 应用前向循环神经网络 - src_forward = grumemory(input=src_embedding, size=encoder_size) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) # 应用反向递归神经网络(reverse=True表示反向循环神经网络) - src_backward = grumemory(input=src_embedding, - size=encoder_size, - reverse=True) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) # 将循环神经网络的前向和反向部分混合在一起 - encoded_vector = concat_layer(input=[src_forward, src_backward]) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) # 投射编码向量到 decoder_size - encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], - size = decoder_size) + encoded_proj = paddle.layer.mixed( + size=decoder_size, + input=paddle.layer.full_matrix_projection(encoded_vector)) # 计算反向RNN的第一个实例 - backward_first = first_seq(input=src_backward) + backward_first = paddle.layer.first_seq(input=src_backward) # 投射反向RNN的第一个实例到 decoder size - decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) + decoder_boot = paddle.layer.mixed( + size=decoder_size, + act=paddle.activation.Tanh(), + input=paddle.layer.full_matrix_projection(backward_first)) 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在 ``gru_decoder_with_attention`` 中定义: .. 
code:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] - trg_embedding = embedding_layer( - input=data_layer(name='target_language_word', - size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) group_inputs.append(trg_embedding) # 对于配备有注意力机制的解码器,在训练中, @@ -185,9 +171,10 @@ Sequence to Sequence Model with Attention # StaticInput 意味着不同时间步的输入都是相同的值, # 否则它以一个序列输入,不同时间步的输入是不同的。 # 所有输入序列应该有相同的长度。 - decoder = recurrent_group(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) 单步函数的实现如下所示。首先,它定义解码网络的\ **Memory**\ 。然后定义 attention,门控循环单元单步函数和输出函数: @@ -198,27 +185,32 @@ attention,门控循环单元单步函数和输出函数: # 定义解码器的Memory # Memory的输出定义在 gru_step 内 # 注意 gru_step 应该与它的Memory名字相同 - decoder_mem = memory(name='gru_decoder', - size=decoder_size, - boot_layer=decoder_boot) + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) # 计算 attention 加权编码向量 - context = simple_attention(encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) # 混合当前词向量和attention加权编码向量 - decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), - full_matrix_projection(current_word)], - size = decoder_size * 3) + decoder_inputs = paddle.layer.mixed( + size=decoder_size * 3, + input=[ + paddle.layer.full_matrix_projection(input=context), + paddle.layer.full_matrix_projection(input=current_word) + ]) # 定义门控循环单元循环神经网络单步函数 - gru_step = gru_step_layer(name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) # 定义输出函数 - out = mixed_layer(input=[full_matrix_projection(input=gru_step)], - size=target_dict_dim, - bias_attr=True, - act=SoftmaxActivation()) + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) return out 生成序列 @@ -238,41 +230,32 @@ attention,门控循环单元单步函数和输出函数: - ``beam_size``: beam search 算法中的beam大小。 - ``max_length``: 生成序列的最大长度。 -- 使用 ``seqtext_printer_evaluator`` - 根据索引矩阵和字典打印文本。这个函数需要设置: - - - ``id_input``: 数据的整数ID,用于标识生成的文件中的相应输出。 - - ``dict_file``: 用于将词ID转换为词的字典文件。 - - ``result_file``: 生成结果文件的路径。 - 代码如下: .. 
code:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] # 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。 # 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。 # 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 。 - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) group_inputs.append(trg_embedding) - beam_gen = beam_search(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, # Beginnning token. - eos_id=1, # End of sentence token. - beam_size=beam_size, - max_length=max_length) - - seqtext_printer_evaluator(input=beam_gen, - id_input=data_layer(name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) - -注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。 - -完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。 + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginnning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) + + return beam_gen + +注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 `book/06.understand_sentiment `_ 了解更多详细信息。 + +完整的配置文件在 `book/08.machine_translation/train.py `_ 。 diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst index 73f5d5371fcd3ce95253cad47b0d8e738284441c..f92edd108ff5c10a31b5f181f0f6dcb7a3f119f3 100644 --- a/doc/howto/deep_model/rnn/rnn_config_en.rst +++ b/doc/howto/deep_model/rnn/rnn_config_en.rst @@ -3,34 +3,11 @@ RNN Configuration This tutorial will guide you how to configure recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to: -- prepare sequence data for learning recurrent neural networks. - configure recurrent neural network architecture. - generate sequence with learned recurrent neural network models. -We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at :code:`demo/seqToseq`. - -===================== -Prepare Sequence Data -===================== - -PaddlePaddle does not need any preprocessing to sequence data, such as padding. The only thing that needs to be done is to set the type of the corresponding type to input. For example, the following code snippets defines three input. All of them are sequences, and the size of them are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`: - -.. code-block:: python - - settings.input_types = [ - integer_value_sequence(len(settings.src_dict)), - integer_value_sequence(len(settings.trg_dict)), - integer_value_sequence(len(settings.trg_dict))] - - -Then at the :code:`process` function, each :code:`yield` function will return three integer lists. Each integer list is treated as a sequence of integers: - -.. 
code-block:: python - - yield src_ids, trg_ids, trg_ids_next - - -For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`. +We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at `book/08.machine_translation `_ . +And the data preparation of this model can be found at `python/paddle/v2/dataset/wmt14.py `_ =============================================== Configure Recurrent Neural Network Architecture @@ -42,7 +19,7 @@ Simple Gated Recurrent Neural Network Recurrent neural network process a sequence at each time step sequentially. An example of the architecture of LSTM is listed below. -.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg +.. image:: src/bi_lstm.jpg :align: center Generally speaking, a recurrent network perform the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`. @@ -75,19 +52,19 @@ Its **output function** simply takes :math:`x_t` as the output. act=None, rnn_layer_attr=None): def __rnn_step__(ipt): - out_mem = memory(name=name, size=size) - rnn_out = mixed_layer(input = [full_matrix_projection(ipt), - full_matrix_projection(out_mem)], - name = name, - bias_attr = rnn_bias_attr, - act = act, - layer_attr = rnn_layer_attr, - size = size) + out_mem = paddle.layer.memory(name=name, size=size) + rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt), + paddle.layer.full_matrix_projection(input=out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) return rnn_out - return recurrent_group(name='%s_recurrent_group' % name, - step=__rnn_step__, - reverse=reverse, - input=input) + return paddle.layer.recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) PaddlePaddle uses memory to construct step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and a **input**. The output of memory at the current time step is utilized as the input of the memory at the next time step. A memory can also has a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of :code:`out_mem` memory. @@ -101,7 +78,7 @@ Sequence to Sequence Model with Attention ----------------------------------------- We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure. -.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png +.. image:: src/encoder-decoder-attention-model.png :align: center In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. 
The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`. @@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge .. code-block:: python # Define the data layer of the source sentence. - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) # Calculate the word embedding of each word. - src_embedding = embedding_layer( + src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) # Apply forward recurrent neural network. - src_forward = grumemory(input=src_embedding, size=encoder_size) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) # Apply backward recurrent neural network. reverse=True means backward recurrent neural network. - src_backward = grumemory(input=src_embedding, - size=encoder_size, - reverse=True) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) # Mix the forward and backward parts of the recurrent neural network together. - encoded_vector = concat_layer(input=[src_forward, src_backward]) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) # Project encoding vector to decoder_size. - encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], - size = decoder_size) + encoded_proj = paddle.layer.mixed( + size=decoder_size, + input=paddle.layer.full_matrix_projection(encoded_vector)) # Compute the first instance of the backward RNN. - backward_first = first_seq(input=src_backward) + backward_first = paddle.layer.first_seq(input=src_backward) # Project the first instance of backward RNN to decoder size. - decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) + decoder_boot = paddle.layer.mixed( + size=decoder_size, + act=paddle.activation.Tanh(), + input=paddle.layer.full_matrix_projection(backward_first)) The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`: .. 
code-block:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] - trg_embedding = embedding_layer( - input=data_layer(name='target_language_word', - size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) group_inputs.append(trg_embedding) # For decoder equipped with attention mechanism, in training, @@ -158,9 +144,10 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network. # StaticInput means the same value is utilized at different time steps. # Otherwise, it is a sequence input. Inputs at different time steps are different. # All sequence inputs should have the same length. - decoder = recurrent_group(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function: @@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th # Defines the memory of the decoder. # The output of this memory is defined in gru_step. # Notice that the name of gru_step should be the same as the name of this memory. - decoder_mem = memory(name='gru_decoder', - size=decoder_size, - boot_layer=decoder_boot) + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) # Compute attention weighted encoder vector. - context = simple_attention(encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) # Mix the current word embedding and the attention weighted encoder vector. - decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), - full_matrix_projection(current_word)], - size = decoder_size * 3) + decoder_inputs = paddle.layer.mixed( + size=decoder_size * 3, + input=[ + paddle.layer.full_matrix_projection(input=context), + paddle.layer.full_matrix_projection(input=current_word) + ]) # Define Gated recurrent unit recurrent neural network step function. - gru_step = gru_step_layer(name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) # Defines the output function. 
- out = mixed_layer(input=[full_matrix_projection(input=gru_step)], - size=target_dict_dim, - bias_attr=True, - act=SoftmaxActivation()) + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) return out @@ -207,45 +199,37 @@ After training the model, we can use it to generate sequences. A common practice - :code:`eos_id`: the end token. Every sentence ends with the end token. - :code:`beam_size`: the beam size used in beam search. - :code:`max_length`: the maximum length of the generated sentences. - -* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set: - - - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files. - - :code:`dict_file`: the dictionary file for converting word id to word. - - :code:`result_file`: the path of the generation result file. The code is listed below: .. code-block:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] # In generation, decoder predicts a next target word based on # the encoded source sequence and the last generated target word. # The encoded source sequence (encoder's output) must be specified by # StaticInput which is a read-only memory. # Here, GeneratedInputs automatically fetchs the last generated word, # which is initialized by a start mark, such as . - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) group_inputs.append(trg_embedding) - beam_gen = beam_search(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, # Beginnning token. - eos_id=1, # End of sentence token. - beam_size=beam_size, - max_length=max_length) + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginnning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) - seqtext_printer_evaluator(input=beam_gen, - id_input=data_layer(name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) + return beam_gen -Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details. +Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment `_ for more details. -The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`. +The full configuration file is located at `book/08.machine_translation/train.py `_ . 
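As a toy illustration of the memory mechanism used throughout this tutorial, the sketch below keeps a running sum of a dense input sequence inside a :code:`recurrent_group`; the layer names and the vector size are made up for the example, and :code:`paddle.layer.addto` is used here only because any layer whose name matches the memory can serve as the memory's output:

.. code-block:: python

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    dim = 4
    seq = paddle.layer.data(
        name='seq', type=paddle.data_type.dense_vector_sequence(dim))

    def running_sum_step(ipt):
        # Read the output of the layer named 'sum_state' from the previous time step.
        prev = paddle.layer.memory(name='sum_state', size=dim)
        # Naming this layer 'sum_state' feeds its output back into the memory above.
        return paddle.layer.addto(
            name='sum_state', input=[ipt, prev], act=paddle.activation.Linear())

    running_sum = paddle.layer.recurrent_group(
        name='running_sum_group', step=running_sum_step, input=seq)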
diff --git a/doc/tutorials/sentiment_analysis/bi_lstm.jpg b/doc/howto/deep_model/rnn/src/bi_lstm.jpg similarity index 100% rename from doc/tutorials/sentiment_analysis/bi_lstm.jpg rename to doc/howto/deep_model/rnn/src/bi_lstm.jpg diff --git a/doc/tutorials/text_generation/encoder-decoder-attention-model.png b/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png similarity index 100% rename from doc/tutorials/text_generation/encoder-decoder-attention-model.png rename to doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..4a80a5245102fb992f513a749f6a02e1130188af --- /dev/null +++ b/doc/howto/dev/build_cn.md @@ -0,0 +1,124 @@ +# 用Docker编译和测试PaddlePaddle + +## 需要的软硬件 + +为了开发PaddlePaddle,我们需要 + +1. 一台电脑,可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统,以及 +1. Docker。 + +不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker image 里。 + +## 总体流程 + +1. 获取源码 + + ```bash + git clone https://github.com/paddlepaddle/paddle + ``` + +2. 安装开发工具到 Docker image 里 + + ```bash + cd paddle; docker build -t paddle:dev . + ``` + + 请注意这个命令结尾处的 `.`;它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile),按照其内容创建一个名为 `paddle:dev` 的 Docker image,并且把各种开发工具安装进去。 + +3. 编译 + + 以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image,同时把当前目录(源码树根目录)映射为 container 里的 `/paddle` 目录,并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码,结果输出到 `/paddle/build`,也就是本地的源码树根目录里的 `build` 子目录。 + + ```bash + docker run --rm -v $PWD:/paddle paddle:dev + ``` + + 上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本,可以用 + + ```bash + docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev + ``` + +4. 运行单元测试 + + 用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试: + + ```bash + NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + 如果编译的时候我们用了 `WITH_GPU=OFF` 选项,那么编译过程只会产生 CPU-based 单元测试,那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要: + + ```bash + docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + 有时候我们只想运行一个特定的单元测试,比如 `memory_test`,我们可以 + + ```bash + nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test" + ``` + +5. 清理 + + 有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要: + + ```bash + rm -rf build + ``` + +## 为什么要 Docker 呀? + +- 什么是 Docker? + + 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。 + +- Docker 还是虚拟机? + + 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + +- 为什么用 Docker? + + 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。 + + 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。 + +- 我可以选择不用Docker吗? + + 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。 + +- 学习 Docker 有多难? + + 理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + +- 我可以用 IDE 吗? + + 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。 + + 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 + + ```emacs + (global-set-key "\C-cc" 'compile) + (setq compile-command + "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + ``` + + 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 + +- 可以并行编译吗? 
+ + 是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + +## 可能碰到的问题 + +- Docker 需要 sudo + + 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。 + +- 在 Windows/MacOS 上编译很慢 + + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。 + +- 磁盘不够 + + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。 diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md new file mode 100644 index 0000000000000000000000000000000000000000..91c41ef8ce3abdec5d69a9cbcebbc49b17d8f663 --- /dev/null +++ b/doc/howto/dev/build_en.md @@ -0,0 +1,124 @@ +# Build using Docker + +## What Developers Need + +To contribute to PaddlePaddle, you need + +1. A computer -- Linux, BSD, Windows, MacOS, and +1. Docker. + +Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image. + +## General Process + +1. Retrieve source code. + + ```bash + git clone https://github.com/paddlepaddle/paddle + ``` + +2. Install build tools into a Docker image. + + ```bash + cd paddle; docker build -t paddle:dev . + ``` + + Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs build tools into it. + +3. Build from source. + + The following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockerfile. `build.sh` invokes `cmake` and `make` to build the PaddlePaddle source code, which has been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer. + + ```bash + docker run -v $PWD:/paddle paddle:dev + ``` + + The above command builds a CUDA-enabled version. If we want to build a CPU-only version, we can type + + ```bash + docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev + ``` + +4. Run unit tests. + + To run all unit tests using the first GPU of a node: + + ```bash + NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them. We can just run + + ```bash + docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + If we want to run a specific unit test, say `memory_test`, we can run + + ```bash + nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test" + ``` + +5. Clean Build. + + Sometimes, we might want to clean all third-party dependencies and built binaries. To do so, just run + + ```bash + rm -rf build + ``` + +## Docker, Or Not? + +- What is Docker? + + If you haven't heard of it, consider it something like Python's virtualenv. + +- Docker or virtual machine?
+ + Some people compare Docker with VMs, but Docker does not virtualize any hardware nor run a guest OS, which means there is no compromise on performance. + +- Why Docker? + + Using a Docker image of build tools standardizes the build environment, which makes it easier for others to reproduce your problems and to help. + + Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want. + +- Can I choose not to use Docker? + + Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer. This document exists because Docker makes development much easier. + +- How difficult is it to learn Docker? + + It takes about ten minutes to read [an introductory article](https://docs.docker.com/get-started), and it saves you more than an hour of installing and configuring the required build tools, especially when new versions of PaddlePaddle require new tools. Not to mention the time saved when other people try to reproduce an issue you report. + +- Can I use my favorite IDE? + + Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like. + + Many PaddlePaddle developers use Emacs. They add the following few lines into their `~/.emacs` configuration file: + + ```emacs + (global-set-key "\C-cc" 'compile) + (setq compile-command + "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + ``` + + so they can type `Ctrl-C` and `c` to build PaddlePaddle from source. + +- Does Docker do parallel building? + + Our build Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to start as many build processes as there are CPU cores. + +## Some Gotchas + +- Docker requires sudo + + The owner of a computer has the administrative privilege, a.k.a. sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that does not require sudo. + +- Docker on Windows/MacOS builds slowly + + On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so as to make the build efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details. + +- Not enough disk space + + The examples in this article use the `--rm` option with the `docker run` command. This option ensures that stopped containers are not left on the hard disk. We can use `docker ps -a` to list all containers, including stopped ones. Sometimes `docker build` generates intermediate dangling images, which also take disk space. To clean them up, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
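For reference, the stock Docker commands for reclaiming that space (plain Docker CLI calls, nothing PaddlePaddle-specific) look like this:

```bash
# Remove all stopped (exited) containers.
docker rm $(docker ps -a -q -f status=exited)

# Remove dangling (untagged) intermediate images left behind by docker build.
docker rmi $(docker images -f "dangling=true" -q)
```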
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index 699390145226ec2b65fdf5122db187e1d30d669e..3e0bf7b3973079a2063d33b6be4fe8a9dc5c07bb 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a") ## 构建和测试 -编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:dev`来代替。 +编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。 如要build这个开发镜像,在源码目录树的根目录中运行: ```bash -➜ docker build -t paddle:dev . +➜ docker build -t paddle:latest-dev . ``` 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: ```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev ``` 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): @@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a") 如果要运行所有的单元测试,可以用如下命令: ```bash -➜ docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" ``` 关于构建和测试的更多信息,请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。 diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md deleted file mode 100644 index 40d1eb62d722244139cc84eb170c190d988f5626..0000000000000000000000000000000000000000 --- a/doc/howto/dev/contribute_to_paddle_en.md +++ /dev/null @@ -1,219 +0,0 @@ -# Contribute Code - -We sincerely appreciate your contributions. You can use fork and pull request -workflow to merge your code. - -## Code Requirements -- Your code comments must be fully documented by - [Doxygen](http://www.stack.nl/~dimitri/doxygen/) style. -- Make sure the compiler option `WITH_STYLE_CHECK` is on and the compiler - passes the code style check. -- All code must have unit test. -- Pass all unit tests. - -The following tutorial guides you into submitting your contibution. - -## [Creating a Fork](https://help.github.com/articles/fork-a-repo/) - -Just head over to the GitHub page and click the "Fork" button. -It's just that simple. - -## Clone - -Clone remote repository. - -```bash -➜ git clone https://github.com/USERNAME/Paddle -➜ cd Paddle -``` - -## Create a local branch - -Paddle is currently using [Git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/). - -All feature and bug fix development work should be done on a new branch, generally create new branch from `develop` branch . - -```bash -➜ git checkout -b my-cool-stuff -``` - -Before the checkout, you need to keep the current branch directory clean, otherwise the untracked file will be brought to the new branch, which can be inspected by `git status`. - -## Using `pre-commit` hook - -Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git -pre-commit hooks. 
It can help us format source codes (cpp, python), check some -basic thing before commit (only one EOL for each file, do not add a huge file -in git). `pre-commit` tests is a part of unit tests in Travis-CI now, every -PR doesn't fit hook can not be merged into Paddle. - -To use [pre-commit](http://pre-commit.com/), you should install it by -`pip install pre-commit`, and currently, Paddle uses `clang-format` to format -c/cpp sources. Please make sure clang-format 3.8+ installed. - -Install and run it as follow: - -```bash -➜ pip install pre-commit -➜ pre-commit install -``` - -When you commit your code, the pre-commit hook will check the local code if there is -anything not suitable to commit, and so on. - -## Start to develop - -In this tutorial, I delete a line in README.md and created a new file. - -We can use `git status` to inspect the changes of current directory, `git diff` to see difference. - -```bash -➜ git status -On branch test -Changes not staged for commit: - (use "git add ..." to update what will be committed) - (use "git checkout -- ..." to discard changes in working directory) - - modified: README.md - -Untracked files: - (use "git add ..." to include in what will be committed) - - test - -no changes added to commit (use "git add" and/or "git commit -a") -``` -## Build and Test - -We package PaddlePaddle's compile environment into a Docker image, called the develop image named `paddle:dev`, it contains all compiling tools that PaddlePaddle needs. - -If you want to build the develop image, just run: - -```bash -➜ docker build -t paddle:dev . -``` - -Then we can use the develop image to build PaddlePaddle source. For example: - -```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev -``` - -The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. - -Then we can generate the production image by copying the compiled PaddlePaddle program into the image by - -```bash -➜ docker build -t paddle:prod -f build/Dockerfile . -``` - -Run unit test finally: - -```bash -➜ docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" -``` - -For more details, you can read [this doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst). - -## Commit - -Next we cancel the changes to the README.md file and then commit our changes by following command lines: - -```bash -➜ git checkout -- README.md -➜ git status -On branch test -Untracked files: - (use "git add ..." to include in what will be committed) - - test - -nothing added to commit but untracked files present (use "git add" to track) -➜ git add test -``` - -We should write a description of each commit by `git commit` to allow others to know -the changes in these files. 
- -```bash -➜ git commit -CRLF end-lines remover...............................(no files to check)Skipped -yapf.................................................(no files to check)Skipped -Check for added large files..............................................Passed -Check for merge conflicts................................................Passed -Check for broken symlinks................................................Passed -Detect Private Key...................................(no files to check)Skipped -Fix End of Files.....................................(no files to check)Skipped -clang-formater.......................................(no files to check)Skipped -[my-cool-stuff c703c041] add test file - 1 file changed, 0 insertions(+), 0 deletions(-) - create mode 100644 233 -``` - -## Keeping Fork Up to Date - -Before pull your request, you should sync your code from the latest PaddlePaddle. -To do this, you'll need to add a remote at first: - -```bash -➜ git remote add upstream https://github.com/PaddlePaddle/Paddle -➜ git remote -origin -upstream -``` - -Update your fork with the latest upstream changes: - -```bash -➜ git fetch upstream -➜ git pull upstream develop -``` - -Now, your local master branch is up-to-date with everything modified upstream. - -## Push to GitHub - -```bash -# push to your repository in Github -➜ git push origin my-cool-stuff -``` - -## Create an issue and a Pull Request - -Create an Issue to describe the problem and record its number. - -Go to the page for your fork on GitHub, select your development branch, -and click the `New pull request`. - -screen shot 2017-04-26 at 9 09 28 pm - -Then select the target branch: - -screen shot 2017-04-26 at 9 11 52 pm - -We can add `resolve #Issue number` in PR description to close the issue automatically after the PR is merge. More details in . - -Then wait for review, if there need to modify, refer to the above steps to update the corresponding origin branch. - -## Delete origin branch - -After the PR is merge into the main repository, we can delete the remote branch on the PR page. - -screen shot 2017-04-26 at 9 18 24 pm - -Or just run: - -```bash -➜ git push origin :my-cool-stuff -``` - -## Delete local branch - -Finally, we delete local branch: - -```bash -➜ git checkout develop - -# delete my-cool-stuff branch -➜ git branch -D my-cool-stuff -``` diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md new file mode 120000 index 0000000000000000000000000000000000000000..c97564d93a7f0a753a23cd97d2467d595bd154ff --- /dev/null +++ b/doc/howto/dev/contribute_to_paddle_en.md @@ -0,0 +1 @@ +../../../CONTRIBUTING.md \ No newline at end of file diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst index 9489a921c70ad6ee5709f46445554f5d9640162c..75037e693b32f923ee7dc9dfec322495fe4ce10a 100644 --- a/doc/howto/dev/new_layer_cn.rst +++ b/doc/howto/dev/new_layer_cn.rst @@ -37,7 +37,7 @@ \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -假设 :math:`z = f(W^T x + b)` ,那么 +假设 :math:`z = W^T x + b` ,那么 .. math:: diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst index 46481f5ead33dc6a26507e021fd9ae0f8316e940..110a9fb38f890a766bb4480e91feb22d3b0838a5 100644 --- a/doc/howto/dev/new_layer_en.rst +++ b/doc/howto/dev/new_layer_en.rst @@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. 
It where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu. -The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. +The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. Suppose our loss function is :math:`c(y)`, then @@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -Suppose :math:`z = f(W^T x + b)`, then +Suppose :math:`z = W^T x + b`, then .. math:: @@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class. Then, for fully connected layer, we need to compute: .. math:: - + \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`. @@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes /* weight */ true); } } - + If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. .. 
code-block:: bash diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..92996585674b46f45549b972b9f295503b1c7f8c --- /dev/null +++ b/doc/howto/dev/new_op_cn.md @@ -0,0 +1,318 @@ +# 如何写新的Operator + + - [概念简介](#概念简介) + - [实现C++类](#实现c类) + - [定义ProtoMaker类](#定义protomaker类) + - [定义Operator类](#定义operator类) + - [定义OpKernel类](#定义opkernel类) + - [注册Operator](#注册operator) + - [编译](#编译) + - [绑定Python](#绑定python) + - [实现单元测试](#实现单元测试) + - [前向Operator单测](#前向operator单测) + - [反向Operator单测](#反向operator单测) + - [编译和执行](#编译和执行) + - [注意事项](#注意事项) + + +## 概念简介 + +简单介绍需要用到基类,详细介绍请参考设计文档。 + +- `framework::OperatorBase`: Operator(简写,Op)基类。 +- `framework::OpKernel`: Op计算函数的基类,称作Kernel。 +- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 +- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 + +依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: + + + 内容 | 定义位置 +-------------- | :---------------------- +OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake +Op定义 | `.cc`文件 +Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 +注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 + + +实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** + + +下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 + + +## 实现C++类 + + +### 定义ProtoMaker类 + +矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。 + +首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释: + +```cpp +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), 2D tensor of size (M x K)"); + AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); + AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); + AddComment(R"DOC( +Two Element Mul Operator. 
+The equation is: Out = X * Y +)DOC"); + } +}; +``` + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数: + + - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。 + - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。 + +构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。 + +上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)。 + + +再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例: + +```cpp +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of scale operator.").NotInGradient(); + AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); + AddComment(R"DOC(Scale operator +The equation is: Out = scale*X +)DOC"); + AddAttr("scale", "scale of scale operator.").SetDefault(1.0); + } +}; +``` + +这个例子有两处不同: + +- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中,如果Op的某个输入不参与反向梯度的计算,请显示地调用`.NotInGradient()`进行设置。 + +- `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 + + +### 定义Operator类 + +下面的点实现了MulOp的定义: + +```cpp +class MulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], + "First matrix's width must be equal with second matrix's height."); + ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + } +}; +``` + +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员: + +```cpp +using framework::OperatorWithKernel::OperatorWithKernel; +``` + +这句表示使用基类`OperatorWithKernel`的构造函数,也可写成: + +```cpp +MulOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} +``` + +还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是: + + - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。 + - 2). 
设置输出Tensor的形状。 + +通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中 + +### 定义OpKernel类 + +`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: + +- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 + +- `typename T` : 表示数据类型,如`float`, `double`等。 + +需要为`MulKernel`类重写`Compute`接口。 +- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`。 +- 与`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。 +- `Compute`函数里实现`OpKernel`的具体计算逻辑。 + +下面是 `MulKernel` `Compute`的实现: + + ```cpp + template + class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* Z = context.Output("Out"); + Z->mutable_data(context.GetPlace()); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + } + }; + ``` + +需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** + +`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 + +为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 + + +到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 +反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 + +### 注册Operator + +- 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 + + ```cpp + namespace ops = paddle::operators; + REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + + 在上面的代码中: + + - `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。 + - `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。 + - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 + + +- 在 `.cu`文件中注册CUDA Kernel。 + - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: + + ```cpp + // if use Eigen unsupported module before include head files + #define EIGEN_USE_GPU + + namespace ops = paddle::operators; + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + +### 编译 + +运行下面命令可以进行编译: + +``` +make mul_op +``` + +## 绑定Python + +系统会对新增的op自动绑定Python,并链接到生成的lib库中。 + +## 实现单元测试 + +单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 + +### 前向Operator单测 + +Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: + +1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 +2. 生成随机的输入数据。 +3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 +4. 
反向计算已经自动集成进测试框架,直接调用相应接口即可。 + + + ```python + import unittest + import numpy as np + from op_test import OpTest + + + class TestMulOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + ``` + +上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: + +- `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 +- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 +- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 + +### 反向operator单测 + +而反向测试中: +- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 + - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 + - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。 + - 第三个参数`max_relative_error`:指定检测梯度时能容忍的最大错误值。 +- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。 + + +### 编译和执行 + +`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 + +请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试: + +```bash +make test ARGS="-R test_mul_op -V" +``` + +或者: + +```bash +ctest -R test_mul_op +``` + +## 注意事项 + +- 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。 +- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。 +- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 +- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md new file mode 100644 index 0000000000000000000000000000000000000000..da8b1bdd1082e439456daf25e9b3a1e8eb534375 --- /dev/null +++ b/doc/howto/dev/new_op_en.md @@ -0,0 +1,336 @@ +# How to write a new operator + + - [Background](#background) + - [Implementing C++ Types](#implementing-c-types) + - [Defining ProtoMaker](#defining-protomaker) + - [Defining Operator](#defining-operator) + - [Defining OpKernel](#defining-opkernel) + - [Registering Operator and OpKernel](#registering-operator-and-opkernel) + - [Compilation](#compilation) + - [Python Binding](#python-binding) + - [Unit Tests](#unit-tests) + - [Testing Forward Operators](#testing-forward-operators) + - [Testing Backward Operators](#testing-backward-operators) + - [Compiling and Running](#compiling-and-running) + - [Remarks](#remarks) +## Background + +Here are the base types needed. For details, please refer to the design docs. + +- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API. +- `framework::OperatorBase`: Operator (Op)base class. +- `framework::OpKernel`: Base class for Op computation kernel. +- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels. + + +Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. 
In short, an operator includes the following information: + + + Information | Where is it defined +-------------- | :---------------------- +OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface. +Op definition | `.cc` files +Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files. +Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. + + +New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.** + + +Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel. + + +## Implementing C++ Types + + +### Defining ProtoMaker + +Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output. + +First, define `ProtoMaker` to describe the Operator's input, output, and additional comments: + +```cpp +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), 2D tensor of size (M x K)"); + AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); + AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); + AddComment(R"DOC( +Two Element Mul Operator. +The equation is: Out = X * Y +)DOC"); + } +}; +``` + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor: + + - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces. + - `framework::OpAttrChecker` is used to validate variable attributes. + +The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`. + +The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). + + +An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows: + +```cpp +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of scale operator.").NotInGradient(); + AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); + AddComment(R"DOC(Scale operator +The equation is: Out = scale*X +)DOC"); + AddAttr("scale", "scale of scale operator.").SetDefault(1.0); + } +}; +``` + +There are two changes in this example: + +- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. 
If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`. + +- `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. + + +### Defining Operator + +The following code defines the interface for MulOp: + +```cpp +class MulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], + "First matrix's width must be equal with second matrix's height."); + ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + } +}; +``` + +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member + +```cpp +using framework::OperatorWithKernel::OperatorWithKernel; +``` + +expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as + +```cpp +MulOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} +``` + +`InferShape` interface needs to be re-written.`InferShape` is a constant method and cannot modify Op's member variables, its constant member `const framework::InferShapeContext &ctx` can be used to extract input, output, and attributes. It functions to + + - 1). validate and error out early: it checks input data dimensions and types. + - 2). configures the tensor shape in the output. + +Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later. + +### Defining OpKernel + +`MulKernel` inherits `framework::OpKernel`, which includes the following templates: + +- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). + +- `typename T` denotes data type, such as `float` or `double`. + +`MulKernel` types need to rewrite the interface for `Compute`. + +- `Compute` takes one input parameter: `const framework::ExecutionContext& context`. +- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables. +- `Compute` implements the computation logics of an `OpKernel`. 
+ +`MulKernel`'s implementation of `Compute` is as follows: + + ```cpp + template + class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* Z = context.Output("Out"); + Z->mutable_data(context.GetPlace()); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + } + }; + ``` + +Note that **different devices (CPU, CUDA)share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions can support both devices.** + +`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). + +To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md). + + +This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file. + +The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**. + +### Registering Operator and OpKernel + +- In `.cc` files, register forward and backward operator classes and the CPU kernel. + + ```cpp + namespace ops = paddle::operators; + REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); + + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + + In that code block, + + - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`. + - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient. + + - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`. + + +- Registering CUDA Kernel in `.cu` files + - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as + + ```cpp + // if use Eigen unsupported module before include head files + #define EIGEN_USE_GPU + + namespace ops = paddle::operators; + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + +### Compilation + +Run the following commands to compile. + +``` +# maybe you need to rerun cmake +make mul_op +``` + +## Python Binding + +The system will automatically bind to Python and link it to a generated library. + +## Unit Tests + +Unit tests for an operator include + +1. comparing a forward operator's implementations on different devices, + +2. comparing a backward operator's implementation on different devices, and + +3. a scaling test for the backward operator. 
+ +Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py). + +### Testing Forward Operators + +A forward operator unit test inherits `unittest.TestCase` and defines metaclass `__metaclass__ = OpTestMeta`. More concrete tests are performed in `OpTestMeta`. Testing a forward operator requires the following: + +1. Defining input, output and relevant attributes in `setUp` method. + +2. Generating random input data. + +3. Implementing the same computation logic in a Python script. + +4. Call check gradient function to check the backward operator. + + ```python + import unittest + import numpy as np + from op_test import OpTest + + + class TestMulOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + ``` +Get its output, and compare it with the forward operator's own output. + +The code above first loads required packages. In addition, we have + +- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type. +- `self.inputs` defines input, with type `numpy.array` and initializes it. +- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. + +### Testing Backward Operators + +Some key points in checking gradient above include: + +- `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. + - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested. + - The second variable `"Out"` points to the network's final output target `Out`. + - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests. +- `test_check_grad_ingore_x` and `test_check_grad_ingore_y`branches test the cases where there is only one scaling input. + +### Compiling and Running + + +Any new unit testing file of the format `test_*.py` added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile. + +Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`. + +After successfully compiling the project, run the following command to run unit tests: + +```bash +make test ARGS="-R test_mul_op -V" +``` + +Or, + +```bash +ctest -R test_mul_op +``` + +## Remarks + +- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file. +- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures. +- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. 
+- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. diff --git a/doc/howto/dev/new_op_kernel_en.md b/doc/howto/dev/new_op_kernel_en.md new file mode 100644 index 0000000000000000000000000000000000000000..123df0a7ee4943c0b789ef9cfa6e0804d0fdd564 --- /dev/null +++ b/doc/howto/dev/new_op_kernel_en.md @@ -0,0 +1,121 @@ +## Add Kernels for a New Device + +### Background + +PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU. + +[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). + +### Write Kernels for A New Device + +#### Add A New Device + + For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP. + +To register a new device, we need to add an enum value to `LibraryType`: + +``` +enum class LibraryType { + kPlain = 0, + kMKLDNN = 1, + kCUDNN = 2, +}; +``` + + +#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53) + +If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`: + +```cpp +struct CUDAPlace { + CUDAPlace() : CUDAPlace(0) {} + explicit CUDAPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const CUDAPlace &o) const { + return device == o.device; + } + inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } + + int device; +}; + +typedef boost::variant Place; +``` + +#### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)) +After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it. + +```cpp +class DeviceContext { + public: + virtual ~DeviceContext() {} + virtual Place GetPlace() const = 0; + + virtual void Wait() const {} +}; +``` + +#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device. + +A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) + +```cpp +class OpKernelBase { + public: + /** + * ExecutionContext is the only parameter of Kernel Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * ExecutionContext. User should construct it before run the Operator. 
+ */ + + virtual void Compute(const ExecutionContext& context) const = 0; + + virtual ~OpKernelBase() = default; +}; + +template +class OpKernel : public OpKernelBase { + public: + using ELEMENT_TYPE = T; +}; +``` + + +#### Register the OpKernel to framework + +After writing the components described above, we should register the kernel to the framework. + +We use `REGISTER_OP_KERNEL` to do the registration. + +```cpp +REGISTER_OP_KERNEL( + op_type, + library_type, + place_type, + kernel0, kernel1, ...) +``` + +kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`. + +take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318)) as an example: + + ```cpp + REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace, + paddle::operators::GemmConvKernel, + paddle::operators::GemmConvKernel); + + REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); + ``` + +In the code above: + + - `conv2d` is the type/name of the operator + - `CUDNN/CPU` is `library` + - `paddle::platform::CUDAPlace/CPUPlace` is `place` + - template parameter `float/double` on `CUDNNConvOpKernel` is `data_type`. diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/howto/dev/use_eigen_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..1367323b71277984834d9d4f0d9bea0f69478479 --- /dev/null +++ b/doc/howto/dev/use_eigen_cn.md @@ -0,0 +1,146 @@ +## 在Paddle中如何使用Eigen + +神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。 + + +### Eigen Tensor模块 + +Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 + +关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) + + +### paddle::framework::Tensor + +Paddle Tensor定义在framework目录下,其主要接口如下: + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. 
*/ + DDim dim_; +}; +``` + +`Placeholder`的作用是延迟分配内存,即我们可以先定义一个Tensor,然后使用Resize接口设置Tensor的大小,最后再调用mutable_data接口分配实际的内存。 + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data(place); +``` + +### paddle::framework::Tensor使用样例 +下面以AddOp为例说明Tensor的使用过程: + +- InferShape + +在运行神经网络计算图时,我们先调用每个`Operator`的`InferShape`接口,根据输入Tensor的大小来设置输出Tensor的大小,`Resize`接口会被调用。 + +```cpp +void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Two input of Add Op's dimension must be same."); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); +} +``` + + +- Run + +`Operator`的`Run`接口最终会调用对应`OpKernel`的`Compute`接口,在这时真正的分配内存,`mutable_data`接口会被调用。 + +```cpp +void Compute(const framework::ExecutionContext& context) const override { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Y"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input0); + auto y = EigenVector::Flatten(*input1); + auto z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + z.device(place) = x + y; +} +``` + + +### paddle::framework::Tensor到EigenTensor的转换 + +如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。 + +以EigenTensor为例,做一个介绍 + +```cpp +Tensor t; +float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); +for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); +} + +EigenTensor::Type et = EigenTensor::From(t); +``` + +From是EigenTensor模板提供的一个接口,可以实现从paddle::framework::Tensor到对EigenTensor的转换。由于Tensor的rank是模板参数,因此在转换时需要显示的指定。 + +在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。 + +更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)。 + + + +### 实现计算 + +当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。 + +```cpp +auto x = EigenVector::Flatten(*input0); +auto y = EigenVector::Flatten(*input1); +auto z = EigenVector::Flatten(*output); +auto place = context.GetEigenDevice(); +z.device(place) = x + y; +``` + +在这段代码中,input0/input1/output可以是任意维度的Tensor。我们调用了EigenVector的Flatten接口,把任意维度的Tensor转为了一维的EigenVector。而在计算结束之后,input0/input1/output的原有shape信息不变。如果想改变原有Tensor的shape信息,可以调用Resize接口进行改变。 + +由于Eigen Tensor模块的文档较少,我们可以参考TensorFlow的[kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels)模块下的相关`OpKernel`的计算代码。 diff --git a/doc/howto/dev/use_eigen_en.md b/doc/howto/dev/use_eigen_en.md new file mode 100644 index 0000000000000000000000000000000000000000..e169106e12f5d62696f1f0e7163562793b32c18c --- /dev/null +++ b/doc/howto/dev/use_eigen_en.md @@ -0,0 +1,146 @@ +## How to use Eigen in Paddle + +Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`. 
+ + +### Eigen Tensor Module + +The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU. + +Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse. + +For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md). + + +### paddle::framework::Tensor + +Paddle Tensor's is defined in the framework directory with the following interface: + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ + DDim dim_; +}; +``` + +`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory. + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data(place); +``` + +### paddle::framework::Tensor Usage +`AddOp` demonstrates Tensor's usage. + +- InferShape + +When computing a neural network's compute graph, first call every `Operator`'s `InferShape` method, and use `Resize` to configure the size of the output tensor. + +```cpp +void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Two input of Add Op's dimension must be same."); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); +} +``` + + +- Run + +```cpp +void Compute(const framework::ExecutionContext& context) const override { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Y"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input0); + auto y = EigenVector::Flatten(*input1); + auto z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + z.device(place) = x + y; +} +``` + + +### paddle::framework::Tensor到EigenTensor的转换 + +As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`. 
+ +Using EigenTensor as an example: + +```cpp +Tensor t; +float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); +for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); +} + +EigenTensor::Type et = EigenTensor::From(t); +``` + +`From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation. + +In Eigen, tensors with different ranks are different types, with `Vector` bring a rank-1 instance. Note that `EigenVector::From` uses a transformation from an 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector. + +For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file. + + + +### Implementing Computation + +While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor. + +```cpp +auto x = EigenVector::Flatten(*input0); +auto y = EigenVector::Flatten(*input1); +auto z = EigenVector::Flatten(*output); +auto place = context.GetEigenDevice(); +z.device(place) = x + y; +``` + +In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We are calling Flatten from EigenVector, transforming a tensor of any dimension into a 1-dimensional EigenVector. After completing computation, input0/input1/output will retain the same shape information, and they can be resized using the `Resize` interface. + +Because the Eigen Tensor module is under-documented, please refer to `OpKernel`'s computation code in TensorFlow's [kernel module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels). diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index d536f53abc031e9d279ace0e231a381a2f1e81b6..1bc947c260d7adb75ee5a2bb10e6b91bc0be2d4c 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -3,75 +3,108 @@ ################## PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。 +也可以利用PaddlePaddle 工具来编译文档,这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下 +如何构建文档 +============ -如何构建PaddlePaddle的文档 -========================== +PaddlePaddle的文档构建有三种方式。 -PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式,我们提供了一个构建脚本build_docs.sh来进行构建。 -PaddlePaddle文档需要准备的环境相对较复杂,所以我们推荐使用基于Docker来构建PaddlePaddle的文档。 +使用PaddlePaddle.org工具 +-------------- +这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。 -使用Docker构建PaddlePaddle的文档 --------------------------------- - -使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 +文件工具是使用Docker,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具 .. 
code-block:: bash - cd TO_YOUR_PADDLE_CLONE_PATH - cd paddle/scripts/tools/build_docs - bash build_docs.sh with_docker + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle -编译完成后,会在当前目录生成两个子目录\: + # Clone the content repositories + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git -* doc 英文文档目录 -* doc_cn 中文文档目录 + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest -打开浏览器访问对应目录下的index.html即可访问本地文档。 +注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令 +之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 +如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 +.. code-block:: bash -直接构建PaddlePaddle的文档 --------------------------- + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle -因为PaddlePaddle的v2 api文档生成过程依赖于py_paddle Python包,用户需要首先确认py_paddle包已经安装。 + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git -.. code-block:: bash + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver - python -c "import py_paddle" +工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 +之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 -如果提示错误,那么用户需要在本地编译安装PaddlePaddle,请参考 `源码编译文档 `_ 。 -注意,用户在首次编译安装PaddlePaddle时,请将WITH_DOC选项关闭。在编译安装正确之后,请再次确认py_paddle包已经安装,即可进行下一步操作。 +想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 -如果提示正确,可以执行以下命令编译生成文档,即 +使用Docker构建 +-------------- + +使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 .. code-block:: bash cd TO_YOUR_PADDLE_CLONE_PATH cd paddle/scripts/tools/build_docs - bash build_docs.sh local + sh build_docs.sh -编译完成之后,会在当前目录生成两个子目录\: +编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 +打开浏览器访问对应目录下的index.html即可访问本地文档。 -* doc 英文文档目录 -* doc_cn 中文文档目录 +直接构建 +-------- +如果提示正确,可以执行以下命令编译生成文档,即 + +.. code-block:: bash + + cd TO_YOUR_PADDLE_CLONE_PATH + mkdir -p build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON + make gen_proto_py + make paddle_docs paddle_docs_cn + +编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 打开浏览器访问对应目录下的index.html即可访问本地文档。 -如何书写PaddlePaddle的文档 -========================== +如何书写文档 +============ PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 -如何更新www.paddlepaddle.org文档 -================================ - -开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中,提交方式可参见 `贡献文档 `_ 。 -目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 -`英文文档 `_ 。 +如何更新www.paddlepaddle.org +============================ +更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 +目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 +`英文文档 `_ 。 .. 
_cmake: https://cmake.org/ diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..b3ef07eb1d0012827df8e6a4f27c5fa643649492 --- /dev/null +++ b/doc/howto/dev/write_docs_en.rst @@ -0,0 +1,80 @@ +################## +Contribute Documentation +################## + +PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. +Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories. +When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content + +How to Build Documentations +============ + +We recommend using PaddlePaddle.org tool to build documentation + + +Use PaddlePaddle.org tool +-------------- +This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser. + +The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories. You may only clone the contents you need + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command +Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + + +If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + +If you want to learn more on the PaddlePaddle.org, please `click here `_ 。 + +How to write Documentations +============ + +PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail. + + +How to update www.paddlepaddle.org +============================ + +Please create PRs and submit them to github, please check `Contribute Code `_ 。 +PaddlePaddle develop branch will update the documentation once the PR is merged. 
User may check latest `Chinese Docs `_ and +`English Docs `_ 。 + +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index 26449a6365843b526b3ac3111b337d2f17524c9d..e0c69f7a6a4043abe999af6c8dd2555178b68424 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -9,9 +9,7 @@ usage/cmd_parameter/index_cn.rst usage/cluster/cluster_train_cn.md - usage/k8s/k8s_basis_cn.md - usage/k8s/k8s_cn.md - usage/k8s/k8s_distributed_cn.md + usage/capi/index_cn.rst 开发标准 -------- @@ -19,8 +17,8 @@ .. toctree:: :maxdepth: 1 - dev/write_docs_cn.rst dev/contribute_to_paddle_cn.md + dev/write_docs_cn.rst 模型配置 -------- diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst index 1fbfcd260b912078f00ed5b720ed607db725c4e2..6d1bf7dfc003da6de31410ee0a7959233adfaf76 100644 --- a/doc/howto/index_en.rst +++ b/doc/howto/index_en.rst @@ -9,8 +9,6 @@ Usage usage/cmd_parameter/index_en.rst usage/cluster/cluster_train_en.md - usage/k8s/k8s_en.md - usage/k8s/k8s_aws_en.md Development ------------ @@ -20,6 +18,7 @@ Development dev/new_layer_en.rst dev/contribute_to_paddle_en.md + dev/write_docs_en.rst Configuration ------------- diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md new file mode 100644 index 0000000000000000000000000000000000000000..368af40cc7308cf6f4c609361078fe3ba02213ed --- /dev/null +++ b/doc/howto/optimization/cpu_profiling.md @@ -0,0 +1,196 @@ +This tutorial introduces techniques we use to profile and tune the +CPU performance of PaddlePaddle. We will use Python packages +`cProfile` and `yep`, and Google's `perftools`. + +Profiling is the process that reveals performance bottlenecks, +which could be very different from what's in the developers' mind. +Performance tuning is done to fix these bottlenecks. Performance optimization +repeats the steps of profiling and tuning alternatively. + +PaddlePaddle users program AI applications by calling the Python API, which calls +into `libpaddle.so.` written in C++. In this tutorial, we focus on +the profiling and tuning of + +1. the Python code and +1. the mixture of Python and C++ code. + +## Profiling the Python Code + +### Generate the Performance Profiling File + +We can use Python standard +package, [`cProfile`](https://docs.python.org/2/library/profile.html), +to generate Python profiling file. For example: + +```bash +python -m cProfile -o profile.out main.py +``` + +where `main.py` is the program we are going to profile, `-o` specifies +the output file. Without `-o`, `cProfile` would outputs to standard +output. + +### Look into the Profiling File + +`cProfile` generates `profile.out` after `main.py` completes. We can +use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into +the details: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` +specifies the profiling file, and `main.py` is the source file. 
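+
+If you prefer not to start an HTTP service, the same `profile.out` can
+also be inspected from an interactive Python session with the standard
+`pstats` module. The snippet below is only a minimal sketch; it assumes
+the `profile.out` file generated by the `cProfile` command above:
+
+```python
+import pstats
+
+# Load the statistics written by `python -m cProfile -o profile.out main.py`.
+stats = pstats.Stats('profile.out')
+
+# Print the 10 functions with the largest tottime (time spent in the
+# function itself, excluding sub-calls).
+stats.sort_stats('tottime').print_stats(10)
+
+# List the callers of any function whose name matches 'sync_with_cpp'.
+stats.print_callers('sync_with_cpp')
+```
+
+The web view served by `cprofilev` shows the same numbers with easier
+navigation.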
+ +Open the Web browser and points to the local IP and the specifies +port, we will see the output like the following: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +where each line corresponds to Python function, and the meaning of +each column is as follows: + +| column | meaning | +| --- | --- | +| ncalls | the number of calls into a function | +| tottime | the total execution time of the function, not including the execution time of other functions called by the function | +| percall | tottime divided by ncalls | +| cumtime | the total execution time of the function, including the execution time of other functions being called | +| percall | cumtime divided by ncalls | +| filename:lineno(function) | where the function is defined | + +### Identify Performance Bottlenecks + +Usually, `tottime` and the related `percall` time is what we want to +focus on. We can sort above profiling file by tottime: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. At this +moment, let's look into the third function `sync_with_cpp`, which is a +Python function. We can click it to understand more about it: + +``` +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +The lists of the callers of `sync_with_cpp` might help us understand +how to improve the function definition. + +## Profiling Python and C++ Code + +### Generate the Profiling File + +To profile a mixture of Python and C++ code, we can use a Python +package, `yep`, that can work with Google's `perftools`, which is a +commonly-used profiler for C/C++ code. 
+ +In Ubuntu systems, we can install `yep` and `perftools` by running the +following commands: + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +Then we can run the following command + +```bash +python -m yep -v main.py +``` + +to generate the profiling file. The default filename is +`main.py.prof`. + +Please be aware of the `-v` command line option, which prints the +analysis results after generating the profiling file. By examining the + the print result, we'd know that if we stripped debug +information from `libpaddle.so` at build time. The following hints +help make sure that the analysis results are readable: + +1. Use GCC command line option `-g` when building `libpaddle.so` so to + include the debug information. The standard building system of + PaddlePaddle is CMake, so you might want to set + `CMAKE_BUILD_TYPE=RelWithDebInfo`. + +1. Use GCC command line option `-O2` or `-O3` to generate optimized + binary code. It doesn't make sense to profile `libpaddle.so` + without optimization, because it would anyway run slowly. + +1. Profiling the single-threaded binary file before the + multi-threading version, because the latter often generates tangled + profiling analysis result. You might want to set environment + variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically + starting multiple threads. + +### Examining the Profiling File + +The tool we used to examine the profiling file generated by +`perftools` is [`pprof`](https://github.com/google/pprof), which +provides a Web-based GUI like `cprofilev`. + +We can rely on the standard Go toolchain to retrieve the source code +of `pprof` and build it: + +```bash +go get github.com/google/pprof +``` + +Then we can use it to profile `main.py.prof` generated in the previous +section: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +Where `-http` specifies the IP and port of the HTTP service. +Directing our Web browser to the service, we would see something like +the following: + +![result](./pprof_1.png) + +### Identifying the Performance Bottlenecks + +Similar to how we work with `cprofilev`, we'd focus on `tottime` and +`cumtime`. + +![kernel_perf](./pprof_2.png) + +We can see that the execution time of multiplication and the computing +of the gradient of multiplication takes 2% to 4% of the total running +time, and `MomentumOp` takes about 17%. Obviously, we'd want to +optimize `MomentumOp`. + +`pprof` would mark performance critical parts of the program in +red. It's a good idea to follow the hints. 
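+
+Besides the web interface, `pprof` can also print a plain-text report
+directly in the terminal, which is handy on machines without a browser.
+The command below is a sketch based on the Go version of `pprof`
+installed above; `-top` selects the text report and `-nodecount` limits
+the number of rows shown:
+
+```bash
+# Print the hottest functions, sorted by flat (self) time.
+pprof -top -nodecount=20 `which python` ./main.py.prof
+```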
diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..14eba0e2f34b115f5cd24920b5b1af07ec953d00 --- /dev/null +++ b/doc/howto/optimization/cpu_profiling_cn.md @@ -0,0 +1,155 @@ +此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 + +Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 + +PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分: + +* Python 代码的性能分析 +* Python 与 C++ 混合代码的性能分析 + + +## Python代码的性能分析 + +### 生成性能分析文件 + +Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: + +```bash +python -m cProfile -o profile.out main.py +``` + +其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。 + +### 查看性能分析文件 + +`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 + +用Web浏览器访问对应网址,即可显示性能分析的结果: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +每一列的含义是: + +| 列名 | 含义 | +| --- | --- | +| ncalls | 函数的调用次数 | +| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | +| percall | tottime的每次调用平均时间 | +| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | +| percall | cumtime的每次调用平均时间 | +| filename:lineno(function) | 文件名, 行号,函数名 | + + +### 寻找性能瓶颈 + +通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 + +将性能分析结果按照tottime排序,效果如下: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 + +```text +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... 
+ ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 + + + +## Python与C++混合代码的性能分析 + +### 生成性能分析文件 + +C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 + +使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +安装完毕后,我们可以通过 + +```bash +python -m yep -v main.py +``` + +生成性能分析文件。生成的性能分析文件为`main.py.prof`。 + +命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: + +1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 +2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 +3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 + +### 查看性能分析文件 + +在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 + +安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: + +```bash +go get github.com/google/pprof +``` + +进而我们可以使用如下命令开启一个HTTP服务: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 + +访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: + +![result](./pprof_1.png) + + +### 寻找性能瓶颈 + +与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 + +例如下图中, + +![kernel_perf](./pprof_2.png) + +在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 + +在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 diff --git a/doc/howto/optimization/pprof_1.png b/doc/howto/optimization/pprof_1.png new file mode 100644 index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb Binary files /dev/null and b/doc/howto/optimization/pprof_1.png differ diff --git a/doc/howto/optimization/pprof_2.png b/doc/howto/optimization/pprof_2.png new file mode 100644 index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b Binary files /dev/null and b/doc/howto/optimization/pprof_2.png differ diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md new file mode 100644 index 0000000000000000000000000000000000000000..31987920f32f217ac2db42548874cfe7da57dd72 --- /dev/null +++ b/doc/howto/read_source.md @@ -0,0 +1,67 @@ +# PaddlePaddle Fluid Source Code Overview + +Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book + +Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework + +Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators + +Memory: 
https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory + +Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform + +# Compile Time + +The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). + +```python +x = fluid.layers.data(name='x', shape=[13], dtype='float32') +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +y_predict = fluid.layers.fc(input=x, size=1, act=None) +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) +``` + +- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#) +- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers) + - Every Layer has one or more operators and variables/parameters + - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files: + - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h) + - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) + - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h) +- Optimizer: `fluid.optimizer.SGD`. It does the following + - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)] + - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)] + +# Run Time + +The following **evaluates** the NN. Instantiates all the variables, operators. + +```python +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +# Allocate memory. Initialize Parameter. +exe.run(fluid.default_startup_program()) + +# Allocate memory. Do computation. +exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) +``` + +- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h) + - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h) +- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] + - Feeds the data: `feed=feeder.feed(data)` + - Evaluates all the operators + - Fetches the result: `fetch_list=[avg_cost]` +- Other worth looking files: + - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live + - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). 
Where all the data (most likely tensors) live + - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/usage/capi/compile_paddle_lib_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..ac5ecffe2ea8ddc3703a32e9a0a8ee83bbe5dd14 --- /dev/null +++ b/doc/howto/usage/capi/compile_paddle_lib_cn.md @@ -0,0 +1,122 @@ +## 编译 PaddlePaddle 预测库 + +### 概述 + +使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时需配制下面这些编译选项: + +必须配置选项: +- `WITH_C_API`,必须配置为`ON`。 + +推荐配置选项: +- `WITH_PYTHON`,推荐配置为`OFF` +- `WITH_SWIG_PY`,推荐配置为`OFF` +- `WITH_GOLANG`,推荐设置为`OFF` + +可选配置选项: +- `WITH_GPU`,可配置为`ON/OFF` +- `WITH_MKL`,可配置为`ON/OFF` + +对推荐配置中的选项建议按照设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 + +下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): + +```shell +PADDLE_ROOT=/path/of/capi +git clone https://github.com/PaddlePaddle/Paddle.git +cd Paddle +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_GOLANG=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. +``` + +执行上述代码生成Makefile文件后,执行:`make && make install`。成功编译后,使用C-API所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件)均会存放于`PADDLE_ROOT`目录中。 + +编译成功后在 `PADDLE_ROOT` 下会看到如下目录结构(包括了编译出的PaddlePaddle头文件和链接库,以及第三方依赖链接库和头文件(如果需要,由链接方式决定)): + +```text +├── include +│   └── paddle +│   ├── arguments.h +│   ├── capi.h +│   ├── capi_private.h +│   ├── config.h +│   ├── error.h +│   ├── gradient_machine.h +│   ├── main.h +│   ├── matrix.h +│   ├── paddle_capi.map +│   └── vector.h +├── lib +│   ├── libpaddle_capi_engine.a +│   ├── libpaddle_capi_layers.a +│   ├── libpaddle_capi_shared.so +│   └── libpaddle_capi_whole.a +└── third_party + ├── gflags + │   ├── include + │   │   └── gflags + │   │   ├── gflags_completions.h + │   │   ├── gflags_declare.h + │   │   ... + │   └── lib + │   └── libgflags.a + ├── glog + │   ├── include + │   │   └── glog + │   │   ├── config.h + │   │   ... + │   └── lib + │   └── libglog.a + ├── openblas + │   ├── include + │   │   ├── cblas.h + │   │   ... + │   └── lib + │   ... + ├── protobuf + │   ├── include + │   │   └── google + │   │   └── protobuf + │   │   ... + │   └── lib + │   └── libprotobuf-lite.a + └── zlib + ├── include + │   ... + └── lib + ... + +``` + +### 链接说明 + +目前提供三种链接方式: + +1. 链接`libpaddle_capi_shared.so` 动态库 + - 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意: + 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 + 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 + 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 + - 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**。 + +2. 链接静态库 `libpaddle_capi_whole.a` + - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: + 1. 需要指定`-Wl,--whole-archive`链接选项。 + 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 + 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 + 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 + +3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a` + - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: + 1. 这种链接方式主要用于移动端预测。 + 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 + 1. 
需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 + 1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。 diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/usage/capi/images/csr.png new file mode 100644 index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031 Binary files /dev/null and b/doc/howto/usage/capi/images/csr.png differ diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/usage/capi/images/sequence_data.png new file mode 100644 index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e Binary files /dev/null and b/doc/howto/usage/capi/images/sequence_data.png differ diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/usage/capi/images/workflow_of_CAPI.png new file mode 100644 index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb Binary files /dev/null and b/doc/howto/usage/capi/images/workflow_of_CAPI.png differ diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/usage/capi/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..fd774fbc742671c5a8009cb742f2c9d06a525199 --- /dev/null +++ b/doc/howto/usage/capi/index_cn.rst @@ -0,0 +1,9 @@ +PaddlePaddle C-API +================== + +.. toctree:: + :maxdepth: 1 + + compile_paddle_lib_cn.md + organization_of_the_inputs_cn.md + workflow_of_capi_cn.md diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/usage/capi/organization_of_the_inputs_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..a889ae4ffab7be02468b4a5ac5a18e3cc77803c9 --- /dev/null +++ b/doc/howto/usage/capi/organization_of_the_inputs_cn.md @@ -0,0 +1,285 @@ +## 输入/输出数据组织 + +这篇文档介绍在使用 PaddlePaddle C-API 时如何组织输入数据,以及如何解析神经网络前向计算的输出结果。 + +### 输入/输出数据类型 +在C-API中,按照基本数据类型在PaddlePaddle内部的定义和实现,输入数据可分为: +1. 一维整型数组 +1. 二维浮点型矩阵 + - 稠密矩阵 + - 稀疏矩阵 + +说明: +1. 一维数组**仅支持整型值**; + - 常用于自然语言处理任务,例如:表示词语在词典中的序号; + - 分类任务中类别标签; +1. 逻辑上高于二维的数据(例如含有多个通道的图片,视频等)在程序实现中都会转化为二维矩阵,转化方法在相应的领域都有通用解决方案,需要使用者自己了解并完成转化; +1. 二维矩阵可以表示行向量和列向量,任何时候如果需要浮点型数组(向量),都应使用C-API中的矩阵来表示,而不是C-API中的一维数组。 +1. 不论是一维整型数组还是二维浮点数矩阵,**为它们附加上序列信息将变成序列输入。PaddlePaddle 会通过判数据是否附带有序列信息来判断一个向量/矩阵是否是一个序列**。当非序列输入时,无需关心和处理序列信息。关于什么是“序列信息”,下文会详细进行介绍。 + +### 基本使用概念 + +- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 +- `Argument` 并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。 +- 在`Argument`内部由`IVector`(对应着上文提到的一维整型数组)和`Matrix`(对应着上文提到的二维浮点型矩阵)来实际存储数据;由 `Sequence Start Positions` (下文详细解释) 来描述输入/输出的序列信息。 + +- **注**: + 1. 这篇文档之后部分将会统一使用`argument`来特指PaddlePaddle中神经网络计算层一个输入/输出数据。 + 1. 使用`paddle_ivector`来特指PaddlePaddle中的一维整型数组。 + 1. 
使用`paddle_matrix`来特指PaddlePaddle中的二维浮点型矩阵。 + +### 组织输入数据 +- 一维整型数组 + + 概念上可以将`paddle_ivector`理解为一个一维的整型数组,通常用于表示离散的类别标签,或是在自然语言处理任务中表示词语在字典中的序号。下面的代码片段创建了含有三个元素`1`、`2`、`3`的`paddle_ivector`。 + ```c + int ids[] = {1, 2, 3}; + paddle_ivector ids_array = + paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false); + CHECK(paddle_arguments_set_ids(in_args, 0, ids_array)); + ``` + +- **稠密矩阵** + - 一个`m×n`的稠密矩阵是一个由`m`行`n`列元素排列成的矩形阵列,矩阵里的元素是浮点数。对神经网络来说,矩阵的高度`m`是一次预测接受的样本数目,宽度$n$是神经网络定义时,`paddle.layer.data`的`size`。 + - 下面的代码片段创建了一个高度为1,宽度为`layer_size`的稠密矩阵,矩阵中每个元素的值随机生成。 + + ```c + paddle_matrix mat = paddle_matrix_create( + /* height = batch size */ 1, + /* width = dimensionality of the data layer */ layer_size, + /* whether to use GPU */ false); + + paddle_real* array; + // Get the pointer pointing to the start address of the first row of the + // created matrix. + CHECK(paddle_matrix_get_row(mat, 0, &array)); + + // Fill the matrix with a randomly generated test sample. + srand(time(0)); + for (int i = 0; i < layer_size; ++i) { + array[i] = rand() / ((float)RAND_MAX); + } + + // Assign the matrix to the argument. + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + ``` + +- **稀疏矩阵** + + PaddlePaddle C-API 中 稀疏矩阵使用[CSR(Compressed Sparse Row Format)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format))格式存储。下图是CSR存储稀疏矩阵的示意图。 +
+![图1. 稀疏矩阵存储示意图](./images/csr.png)
+
+图1. 稀疏矩阵存储示意图
+ + CSR存储格式通过:(1)非零元素的值(上图中的`values`);(2)行偏移(上图中的`row offsets`):每一行元素在`values`中的起始偏移,`row offsets`中元素个数总是等于行数 + 1;(3)非零元素的列号(上图中的`column indices`)来确定稀疏矩阵的内容。 + + 在PaddlePaddle C-API中,通过调用以下接口创建稀疏矩阵: + + ```c + PD_API paddle_matrix paddle_matrix_create_sparse( + uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu); + ``` + + 1. 创建稀疏矩阵时需要显示地指定矩阵的(1)高度(`height`,在神经网络中等于一次预测处理的样本数)(2)宽度(`width`,`paddle.layer.data`的`size`)以及(3)非零元个数(`nnz`)。 + 1. 当上述接口第4个参数`isBinary`指定为`true`时,**只需要设置行偏移(`row_offset`)和列号(`colum indices`),不需要提供元素值(`values`)**,这时行偏移和列号指定的元素默认其值为1。 + + 下面的代码片段创建了一个CPU上的二值稀疏矩阵: + + ```c + paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, true, false); + int colIndices[] = {9, 93, 109}; // layer_size here is greater than 109. + int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)}; + + CHECK(paddle_matrix_sparse_copy_from(mat, + rowOffset, + sizeof(rowOffset) / sizeof(int), + colIndices, + (colIndices) / sizeof(int), + NULL /*values array is NULL.*/, + 0 /*size of the value arrary is 0.*/)); + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + ``` + 下面的代码片段在创建了一个CPU上的带元素值的稀疏矩阵: + ```c + paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, false, false); + int colIndices[] = {9, 93, 109}; // layer_size here is greater than 109. + int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)}; + float values[] = {0.5, 0.5, 0.5}; + + CHECK(paddle_matrix_sparse_copy_from(mat, + rowOffset, + sizeof(rowOffset) / sizeof(int), + colIndices, + sizeof(colIndices) / sizeof(int), + values, + sizeof(values) / sizeof(float))); + ``` + 注意事项: + 1. 移动端预测**不支持**稀疏矩阵及相关的接口。 + +### 组织序列信息 + +多个排成一列的元素(可以是整型、浮点数、浮点数向量等)构成一个序列,元素之间的顺序是序列所携带的重要信息。不同序列可能会含有不同数目个元素。在 PaddlePaddle 中,序列输入/输出数据是在上文介绍的**数据输入(一维整型数组,二维浮点数矩阵)基础上,附加上序列信息**。下面详细解释什么是“序列信息”。 + +我们将神经网络一次计算接受的所有输入样本称之为一个`batch`(可以含有一条或多条样本),每一个序列在整个`batch`中的偏移,就是PaddlePaddle中所指的**序列信息**,称之为“sequence start positions”。PaddlePaddle 支持两种序列类型: + +1. 单层序列 + - 序列中的每一个元素是非序列,是进行计算的基本单位,不可再进行拆分。 + - 例如:自然语言中的句子是一个序列,序列中的元素是词语; +1. 双层序列 + - 序列中的每一个元素又是一个序列。 + - 例如:自然语言中的段落是一个双层序列;段落是由句子构成的序列;句子是由词语构成的序列。 + - 双层序列在处理长序列的任务或是构建层级模型时会发挥作用。 + +这篇文档之后部分会统一使用`sequence_start_positions`来特指:PaddlePaddle中神经网络计算层输入/输出所携带的序列信息。 + +对双层序列来讲,不仅要提供每一个外层序列在整个`batch`中的偏移,每一个外层序列又含有若干个内层序列,需要同时提供每一个内层序列在整个`batch`中的偏移。也就是说:**双层序列需要设置分别为外层序列和内层序列分别设置`sequence_start_positions`信息**。 + +**注:** +1. 不论序列中的元素在内存中占用多少实际存储空间,`sequence_start_positions`表示的偏移是以“序列中的一个元素”作为统计的基本单位,而不是相对`batch`起始存储地址以数据的存储大小为单位的偏移。 +1. 非序列输入不携带`sequence_start_positions`,非序列输入无需构造`sequence_start_positions`。 +1. **不论是单层序列还是双层序列的序列信息,都使用`paddle_ivector`(也就是PaddlePaddle中的一维整型数组)来存储。** + +图2 是PaddlePaddle中单层序列和双层序列存储示意图。 +
+![图2. 序列输入示意图](./images/sequence_data.png)
+
+图2. 序列输入示意图
+ +- 单层序列 + + 图2 (a) 展示了一个含有4个序列的`batch`输入: + 1. 4个序列的长度分别为:5、3、2、4; + 1. 这时的`sequence_start_positions`为:`[0, 5, 8, 10, 14]`; + 1. 本地训练. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,都可以通过调用下面的接口为原有的数据输入附加上序列信息,使之变为一个单层序列输入,代码片段如下: + + ```c + int seq_pos_array[] = {0, 5, 8, 10, 14}; + paddle_ivector seq_pos = paddle_ivector_create( + seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false); + // Suppose the network only has one input data layer. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos)); + ``` + +- 双层序列 + + 图2 (b) 展示了一个含有4个序列的`batch`输入; + 1. 4个序列的长度分别为:5、3、2、4;这四个序列又分别含有3、2、1、2个子序列; + 1. 这时的需要同时提供: + - 外层序列在`batch`中的起始偏移`:[0, 5, 8, 10, 14]`; + - 内层序列在`batch`中的起始偏移:`[0, 2, 3, 5, 7, 8, 10, 13, 14]`; + 1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,这时需要调用创建序列信息和为`argument`设置序列信息的接口**两次**,分别为数据输入添加外层序列和内层序列的序列信息,使之变为一个双层序列输入,代码片段如下: + ```c + // set the sequence start positions for the outter sequences. + int outter_seq_pos_array[] = {0, 5, 8, 10, 14}; + paddle_ivector seq_pos = + paddle_ivector_create(outter_seq_pos_array, + sizeof(outter_pos_array) / sizeof(int), + false, + false); + // The third parameter of this API indicates the sequence level. + // 0 for the outter sequence. 1 for the inner sequence. + // If the input is a sequence not the nested sequence, the third parameter is + // fixed to be 0. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos)); + + // set the sequence start positions for the outter sequences. + int inner_seq_pos_array[] = {0, 2, 3, 5, 7, 8, 10, 13, 14}; + paddle_ivector seq_pos = paddle_ivector_create( + inner_pos_array, sizeof(inner_pos_array) / sizeof(int), false, false); + // The third parameter of this API indicates the sequence level. + // 0 for the outter sequence. 1 for the inner sequence. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 1, seq_pos)); + ``` + +注意事项: +1. 当一个`batch`中含有多个序列,**不支持序列长度为`0`的序列(也就是空输入)** 作为输入。不同计算层对空输入的处理策略有可能不同,潜在会引起未定义行为,或者引起行时错误,请在输入时进行合法性检查。 + +### Python 端数据类型说明 + +下表列出了Python端训练接口暴露的数据类型(`paddle.layer.data`函数`type`字段的取值)对应于调用C-API需要创建的数据类型: + + + +
+| Python 端数据类型 | C-API 输入数据类型 |
+| :--- | :--- |
+| paddle.data_type.integer_value | 整型数组,无需附加序列信息 |
+| paddle.data_type.dense_vector | 浮点型稠密矩阵,无需附加序列信息 |
+| paddle.data_type.sparse_binary_vector | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,无需附加序列信息 |
+| paddle.data_type.sparse_vector | 浮点型稀疏矩阵,需提供非零元的值,无需附加序列信息 |
+| paddle.data_type.integer_value_sequence | 整型数组,需附加序列信息 |
+| paddle.data_type.dense_vector_sequence | 浮点型稠密矩阵,需附加序列信息 |
+| paddle.data_type.sparse_binary_vector_sequence | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加序列信息 |
+| paddle.data_type.sparse_vector_sequence | 浮点型稀疏矩阵,需提供非零元的值,需附加序列信息 |
+| paddle.data_type.integer_value_sub_sequence | 整型数组,需附加双层序列信息 |
+| paddle.data_type.dense_vector_sub_sequence | 浮点型稠密矩阵,需附加双层序列信息 |
+| paddle.data_type.sparse_binary_vector_sub_sequence | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加双层序列信息 |
+| paddle.data_type.sparse_vector_sub_sequence | 浮点型稀疏矩阵,需提供非零元的值,需附加双层序列信息 |
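+上表仅给出类型的对应关系。下面是一个示意性代码片段(非完整示例),以 `paddle.data_type.integer_value_sequence` 为例,展示这类输入在 C-API 端的组织方式:整型数据存入 `paddle_ivector`,再附加 `sequence_start_positions` 序列信息。片段中假设 `in_args` 已按上文方式创建,且网络的第 0 个输入接受该类型数据。
+
+```c
+// 假设:in_args 已创建;网络第 0 个输入为 integer_value_sequence 类型。
+// 本例的 batch 含两条序列:{1, 2, 3} 和 {4, 5},对应的序列偏移为 {0, 3, 5}。
+int ids[] = {1, 2, 3, 4, 5};
+paddle_ivector ids_array =
+    paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false);
+CHECK(paddle_arguments_set_ids(in_args, 0, ids_array));
+
+// 附加序列信息后,这个一维整型数组即成为一个单层序列输入。
+int seq_pos_array[] = {0, 3, 5};
+paddle_ivector seq_pos = paddle_ivector_create(
+    seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
+CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+```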
+ + +### 输出数据 + +PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织方式完全相同。一个输出数据同样被组织为一个`argument`,`argument`通过`paddle_matrix`或`paddle_ivector`存数数据,如果输出是一个序列,那么会携带有`sequence_start_positions`信息。调用C-API相关接口,读取需要的结果即可。 + +### 总结 + +- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为`argument`。 +- `argument`并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。 +- 在`argument`内部由`paddle_ivector`(一维整型数组)和`paddle_matrix`(二维浮点型矩阵)来实际存储数据。 +如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。 + +于是,在组织神经网络输入时,需要思考完成以下工作: +1. 为每一个输入/输出创建`argument`。 + - C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。 +1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。 + - C-API 中操作`paddle_ivector`的接口请查看 [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h)。 + - C-API 中操作`paddle_matrix`的接口请查看[matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h)。 +1. 如果输入是序列数据,需要创建并填写`sequence_start_positions`信息。 + - 通过调用 [`paddle_arguments_set_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L137) 来为一个`argument`添加序列信息。 + - 通过调用 [`paddle_arguments_get_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L150) 来读取一个`argument`添加序列信息。 + - 接口说明请查看 [argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) 文件。 diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/usage/capi/workflow_of_capi_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..e0a42fff12cf0f53dee18165e059150861524f74 --- /dev/null +++ b/doc/howto/usage/capi/workflow_of_capi_cn.md @@ -0,0 +1,119 @@ +## C-API 使用流程 + +这篇文档介绍 PaddlePaddle C-API 整体使用流程。 + +### 使用流程 + +使用 C-API 的工作流程如图1所示,分为(1)准备预测模型和(2)预测程序开发两大部分。 + +
+![图1. C-API使用流程示意图](./images/workflow_of_CAPI.png)
+
+图1. C-API使用流程示意图
+ +- 准备预测模型 + 1. 只将神经网络结构进行序列化。 + - 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。 + 1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。 + - 神经网络模型结构和训练好的模型将被序列化合并入一个文件。 + - 预测时只需加载一个文件便于发布。 + - **注意**:以上两种方式只需选择其一即可。 +- 调用 C-API 开发预测序 + 1. 初始化PaddlePaddle运行环境。 + 1. 加载预测模型。 + 1. 创建神经网络输入,组织输入数据。 + 1. 进行前向计算,获得计算结果。 + 1. 清理和结束。 + +### 准备预测模型 + +准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。 + +调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 + +下面,我们将训练结束后存储下来的模型转换成预测模型。 + +1. 序列化神经网络模型配置 + + PaddlePaddle 使用 protobuf 来传输网络配置文件中定义的网络结构和相关参数,使用 C-API 进行预测时,需要将网络结构使用 protobuf 进行序列化,写入文件中。 + + 调用[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)中的`dump_v2_config`函数能够将使用 PaddlePaddle V2 API 定义的神经网络结构 dump 到指定文件中,示例代码如下: + + ```python + from paddle.utils.dump_v2_config import dump_v2_config + from mnist_v2 import network + + predict = network(is_infer=True) + dump_v2_config(predict, "trainer_config.bin", True) + ``` + + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 + + 使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。 + +2. 合并模型文件(可选) + + 一些情况为了便于发布,希望能够将序列化后的神经网络结构和训练好的模型参数打包进一个文件。对于这样的需求,可以使用`paddle.utils.merge_model`中的`merge_v2_model`接口对神经网络结构和训练好的参数进行序列化,将序列化结果写入一个文件内。 + + 代码示例如下: + + ```python + from paddle.utils.merge_model import merge_v2_modelss + from mnist_v2 import network + + net = network(is_infer=True) + param_file = "models/params_pass_4.tar" + output_file = "output.paddle.model" + merge_v2_model(net, param_file, output_file) + ``` + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 + +#### 注意事项 +1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。 +1. **预测使用的网络结构往往不同于训练**,通常需要去掉网络中的:(1)类别标签层;(2)损失函数层;(3)`evaluator`等,只留下核心计算层,请注意是否需要修改网络结构。 +1. 预测时,可以获取网络中定义的任意多个(大于等于一个)层前向计算的结果,需要哪些层的计算结果作为输出,就将这些层加入一个Python list中,作为调用`dump_v2_config`的第一个参数。 + +### 编写预测代码 + +预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 + +#### step 1. 初始化PaddlePaddle运行环境 +第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 + +#### step2. 
加载模型 + +这里介绍C-API使用中的一个重要概念:Gradient Machine。 + +概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式: + +1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; +1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。 + +- 注意事项 + 1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。 + 1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。 + 1. 通过灵活使用以上两个接口,加载模型可其它多种方式,例如也可在程序运行过程中再加载另外一个模型。 + +#### step 3. 创建神经网络输入,组织输入数据 + +基本使用概念: +- 在PaddlePaddle内部,神经网络中一个计算层的输入输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 +- `Argument` 并不真正“存储”数据,而是将输入/输出数据有机地组织在一起。 +- 在`Argument`内部由:1. `Matrix`(二维矩阵,存储浮点类型输入/输出);2. `IVector`(一维数组,**仅用于存储整型值**,多用于自然语言处理任务)来实际存储数据。 + +C-API支持的所有输入数据类型和他们的组织方式,请参考“输入/输出数据组织”一节。 + +这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。 + +在组织神经网络输入,获取输出时,需要思考完成以下工作: +1. 为每一个输入/输出创建`argument`; +1. 为每一个`argument`创建`paddle_matrix`来存储数据; + +与输入不同的是,不需在使用C-API时为输出`argument`的`paddle_matrix`对象分配空间。前向计算之后PaddlePaddle内部已经分配/管理了每个计算层输出的存储空间。 + +#### step 4. 前向计算 + +完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。 + +#### step 5. 清理 + +结束预测之后,对使用的中间变量和资源进行清理和释放。 diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md index 274452fbf0c595ad7b4dbeffe85ad9038f12b458..c2fc86687d7106aac7c74d6dd16bc229353cb7c1 100644 --- a/doc/howto/usage/cluster/cluster_train_cn.md +++ b/doc/howto/usage/cluster/cluster_train_cn.md @@ -1,159 +1,188 @@ -```eval_rst -.. _cluster_train: -``` - -# 运行分布式训练 - -在本文中,我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。 +# 分布式训练 -在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统(如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) )的用户参考。 -## 前提条件 +## 概述 -1. 上述脚本使用 Python 库 [fabric](http://www.fabfile.org/) 来运行 SSH 命令。 我们使用 `pip` 来安装 fabric: +本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示: - ```bash - pip install fabric - ``` + -2. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,需要在 `/usr/local/cuda` 中安装 CUDA; 否则 Paddle 将在运行时报错。 +- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。 +- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。 +- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。 -3. 
在 [`cluster_train/conf.py`] 中设置 `ROOT_DIR`, 该 ROOT_DIR 要在所有节点上存在。为了方便起见,我们通常在所有节点上创建一个 Unix 用户 `paddle`,并设置 `ROOT_DIR=/home/paddle`。这样,我们可以将 SSH 公钥写入 `/home/paddle/.ssh/authorized_keys`,以便用户 `paddle` 可以 SSH 到所有节点而不用密码。 +这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。 -## 准备工作空间 +在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。 -我们将放置依赖库、配置等文件的目录视为 *工作空间(workspace)*。 -这些 `train/test` 数据应该在启动集群作业之前准备好。 为了满足训练/测试数据放置在工作空间中不同目录的要求,PADDLE 根据在模型配置文件中使用的名为 `train.list/test.list` 的索引文件引用训练/测试数据,所以训练/测试数据也包含 train.list/test.list 两个列表文件。所有本地训练 demo 已经提供了脚本来帮助您创建这两个文件,并且集群作业中的所有节点将在正常情况下处理具有相同逻辑代码的文件。 +## 环境准备 -通常,你可以使用本地训练中的相同模型文件进行集群训练。请记住,在模型文件的 `setting`函数中设置的 `batch_size` 表示在集群作业**每个**节点中的 batch 大小,而不是使用同步 SGD 的总 batch 大小。 +1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。 +1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。 -以下步骤基于 demo 目录中的 [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation)。 +安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`): +```bash +$ paddle version +PaddlePaddle 0.10.0, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF +``` -你只需完成 demo/recommendation 教程文档到 `Train` 的部分,之后你会得到训练/测试数据和模型配置文件。最后,只需使用 demo/recommendation 作为集群训练的工作空间。 +下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。 -最后,你的工作空间应如下所示: +## 启动参数说明 +### 启动参数服务器 +执行以下的命令启动一个参数服务器并等待和计算节点的数据交互 +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 ``` -. 
-|-- common_utils.py -|-- data -| |-- config.json -| |-- config_generator.py -| |-- meta.bin -| |-- meta_config.json -| |-- meta_generator.py -| |-- ml-1m -| |-- ml_data.sh -| |-- ratings.dat.test -| |-- ratings.dat.train -| |-- split.py -| |-- test.list -| `-- train.list -|-- dataprovider.py -|-- evaluate.sh -|-- prediction.py -|-- preprocess.sh -|-- requirements.txt -|-- run.sh -`-- trainer_config.py -``` -虽然这些文件并非都需要集群训练,但是也没有必要删除无用的文件。 -`trainer_config.py` -表示模型配置文件。 +如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行: +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log +``` -`train.list` 和 `test.list` -文件索引。它存储当前节点所有训练/测试数据的所有相对或绝对文件路径。 +参数说明 -`dataprovider.py` -用于读取训练/测试样本。这与本地训练相同。 +- port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信 +- ports_num:**必选,默认1**,监听的端口个数 +- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数 +- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 -`data` -数据目录中的所有文件被 train.list/test.list 引用。 +### 启动计算节点 +执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py) +```bash +$ python train.py +``` +trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 -## 准备集群作业配置 +使用环境变量: -以下选项必须在 cluster_train/conf.py 中认真设置 +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 +export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +``` -`HOSTS` 所有节点运行集群作业的主机名或 IP 。你还可以将用户和 ssh 端口附加到主机名上,例如 root@192.168.100.17:9090。 +使用参数: -`ROOT_DIR` 用于放置 JOB 工作空间目录的工作空间 ROOT 目录 +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") +``` -`PADDLE_NIC` 集群通信通道的 NIC(Network Interface Card, 网络接口卡) 接口名称,例如以太网的 eth0,infiniband 的 ib0。 +参数说明 -`PADDLE_PORT` 集群通信通道的端口号 +- use_gpu: **可选,默认False**,是否启用GPU训练 +- trainer_count:**必选,默认1**,当前训练任务trainer总个数 +- port:**必选,默认7164**,连接到pserver的端口 +- ports_num:**必选,默认1**,连接到pserver的端口个数 +- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 +- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 +- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 +- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 -`PADDLE_PORTS_NUM` 用于集群通信通道的端口数。 如果集群节点数量少(少于5〜6个节点),建议将其设置为较大,如2〜8,以获得更好的网络性能。 -`PADDLE_PORTS_NUM_FOR_SPARSE` 用于 sparse remote updater 集群通信信道的端口数。如果使用 sparse remote update,则可以像 `PADDLE_PORTS_NUM` 一样设置。 +### 准备数据集 -`LD_LIBRARY_PATH` 为集群作业设置额外的 LD_LIBRARY_PATH。你可以使用它来设置 CUDA 库的路径。 +参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。 -默认配置如下: +在线上系统中,通常会使用MapReduce任务的输出结果作为训练结果,这样训练文件的个数会比较多,而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件: ```python -HOSTS = [ - "root@192.168.100.17", - "root@192.168.100.18", - ] - -''' -工作空间配置 -''' - -#工作空间根目录 -ROOT_DIR = "/home/paddle" - -''' -网络配置 -''' -#pserver NIC -PADDLE_NIC = "eth0" -#pserver 端口 -PADDLE_PORT = 7164 -#pserver 端口数 -PADDLE_PORTS_NUM = 2 -#pserver sparse ports num -PADDLE_PORTS_NUM_FOR_SPARSE = 2 - -#集群作业中所有进程的环境设置 -LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64" +import os 
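+# 注:TRAINER_COUNT 和 TRAINER_ID 需由使用者预先设置,通常可从集群平台提供的环境变量中获取(见下文)。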
+train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) +``` + +示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000`、`-00001`和`-00002`): ``` +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` + +在进行分布式训练时,每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中,系统会提供一个分布式存储服务,这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储,则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。 -### 启动集群作业 -`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为```paddle.py``` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 +对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。 -`paddle.py` 为方便作业启动提供了两个独特的命令选项。 +### 准备训练程序 -`job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 conf.py 中设置的所有节点。 它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。 -`job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。 +我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。 -`cluster_train/run.sh` 提供了命令样例来运行 `demo/recommendation` 集群工作,只需用你定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后: +最后,工作空间应如下所示: ``` -sh run.sh +. +|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 ``` -集群作业将会在几秒后启动。 +- `my_lib.py`:会被`train.py`调用的一些用户定义的库函数,比如PIL库等。 +- `word_dict.pickle`:在`train.py`中会使用到的字典数据文件。 +- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置: + + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + ``` -### 终止集群作业 -`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。 +- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。 +- `test_data_dir`:包含测试数据集的目录。 -### 检查集群训练结果 -详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。 +## 使用分布式计算平台或工具 -`paddle_trainer.INFO` -提供几乎所有训练的内部输出日志,与本地训练相同。这里检验运行时间模型的收敛。 +PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括: +- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。 +- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。 +- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。 -`paddle_pserver2.INFO` -提供 pserver 运行日志,有助于诊断分布式错误。 +对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。 -`server.log` -提供 pserver 进程的 stderr 和 stdout。训练失败时可以检查错误日志。 +在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。 -`train.log` -提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。 +## 在不同集群中运行 -### 检查模型输出 -运行完成后,模型文件将被写入节点 0 的 `output` 目录中。 -工作空间中的 `nodefile` 表示当前集群作业的节点 ID。 + - [fabric集群](fabric_cn.md) + - [openmpi集群](openmpi_cn.md) + - [kubernetes单机](k8s_cn.md) + - [kubernetes distributed分布式](k8s_distributed_cn.md) + - [AWS上运行kubernetes集群训练](k8s_aws_cn.md) diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md index c60876721cbf5565d6e48c8061811aacada748cd..28cd1fa7903e559e33a7fc2f00172fdfbe2fdc97 100644 --- 
a/doc/howto/usage/cluster/cluster_train_en.md +++ b/doc/howto/usage/cluster/cluster_train_en.md @@ -1,156 +1,191 @@ -# Run Distributed Training +# Distributed Training -In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation). +## Introduction -[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s). +In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job: -## Prerequisite + -1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands. We can use `pip` to install fabric: +- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job. +- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training. +- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers. - ```bash - pip install fabric - ``` +PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD. -1. We need to install PaddlePaddle on all nodes in the cluster. To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime. +When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient. -1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes. For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`. In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password. +## Preparations +1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes". +2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. 
We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html). -## Prepare Job Workspace +After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`): -We refer to the directory where we put dependent libraries, config files, etc., as *workspace*. +```bash +$ paddle version +PaddlePaddle 0.10.0rc, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF +``` -These `train/test` data should be prepared before launching cluster job. To satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as `train.list/test.list` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files, and all nodes in cluster job will handle files with same logical code in normal condition. +We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API. -Generally, you can use same model file from local training for cluster training. What you should have in mind that, the `batch_size` set in `setting` function in model file means batch size in `each` node of cluster job instead of total batch size if synchronization SGD was used. +## Command-line arguments -Following steps are based on [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation) demo in demo directory. +### Starting parameter server -You just go through demo/recommendation tutorial doc until `Train` section, and at last you will get train/test data and model configuration file. Finaly, just use demo/recommendation as workspace for cluster training. +Type the below command to start a parameter server which will wait for trainers to connect: -At last your workspace should look like as follow: -``` -. -|-- common_utils.py -|-- data -| |-- config.json -| |-- config_generator.py -| |-- meta.bin -| |-- meta_config.json -| |-- meta_generator.py -| |-- ml-1m -| |-- ml_data.sh -| |-- ratings.dat.test -| |-- ratings.dat.train -| |-- split.py -| |-- test.list -| `-- train.list -|-- dataprovider.py -|-- evaluate.sh -|-- prediction.py -|-- preprocess.sh -|-- requirements.txt -|-- run.sh -`-- trainer_config.py +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 ``` -Not all of these files are needed for cluster training, but it's not necessary to remove useless files. -`trainer_config.py` -Indicates the model config file. +If you wish to run parameter servers in background, and save a log file, you can type: +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log +``` -`train.list` and `test.list` -File index. It stores all relative or absolute file paths of all train/test data at current node. +Parameter Description -`dataprovider.py` -used to read train/test samples. It's same as local training. +- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput. 
+- ports_num: **required, default 1**, total number of ports will listen on. +- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update. +- num_gradient_servers: **required, default 1**, total number of gradient servers. -`data` -all files in data directory are refered by train.list/test.list which are refered by data provider. +### Starting trainer +Type the command below to start the trainer(name the file whatever you want, like "train.py") +```bash +$ python train.py +``` -## Prepare Cluster Job Configuration +Trainers' network need to be connected with parameter servers' network to finish the job. Trainers need to know port and IPs to locate parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass to `paddle.init()` function. Arguments passed to the `paddle.init()` function will overwrite environment variables. -The options below must be carefully set in cluster_train/conf.py +Use environment viriables: -`HOSTS` all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090. +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 +export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +python train.py +``` -`ROOT_DIR` workspace ROOT directory for placing JOB workspace directory +Pass arguments: -`PADDLE_NIC` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethternet, ib0 for infiniband. +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") +``` -`PADDLE_PORT` port number for cluster commnunication channel +Parameter Description -`PADDLE_PORTS_NUM` the number of port used for cluster communication channle. if the number of cluster nodes is small(less than 5~6nodes), recommend you set it to larger, such as 2 ~ 8, for better network performance. +- use_gpu: **optional, default False**, set to "True" to enable GPU training. +- trainer_count: **required, default 1**, total count of trainers in the training job. +- port: **required, default 7164**, port to connect to parameter server. +- ports_num: **required, default 1**, number of ports for communication. +- ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation. +- num_gradient_servers: **required, default 1**, total number of gradient server. +- trainer_id: **required, default 0**, ID for every trainer, start from 0. +- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". -`PADDLE_PORTS_NUM_FOR_SPARSE` the number of port used for sparse updater cluster commnunication channel. if sparse remote update is used, set it like `PADDLE_PORTS_NUM` +### Prepare Training Dataset -`LD_LIBRARY_PATH` set addtional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path. +Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). 
Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files. -Default Configuration as follow: +In the real world, we often use `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training file to trainers: ```python -HOSTS = [ - "root@192.168.100.17", - "root@192.168.100.18", - ] - -''' -workspace configuration -''' - -#root dir for workspace -ROOT_DIR = "/home/paddle" - -''' -network configuration -''' -#pserver nics -PADDLE_NIC = "eth0" -#pserver port -PADDLE_PORT = 7164 -#pserver ports num -PADDLE_PORTS_NUM = 2 -#pserver sparse ports num -PADDLE_PORTS_NUM_FOR_SPARSE = 2 - -#environments setting for all processes in cluster job -LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64" +import os +train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) ``` -### Launching Cluster Job -`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. +Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and`-00002`: -`paddle.py`provides two distinguished command option for easy job launching. +``` +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` + +When job started, every trainer needs to get it's own part of data. In some distributed systems a storage service will be provided, so the date under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node. + +Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job. -`job_dispatch_package` set it with local `workspace`directory, it will be dispatched to all nodes set in conf.py. It could be helpful for frequent hacking workspace files, otherwise frequent mulit-nodes workspace deployment could make your crazy. -`job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy -dispatch latency. +### Prepare Training program -`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then: +We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory. + + +Your workspace may looks like: ``` -sh run.sh +. +|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 ``` -The cluster Job will start in several seconds. +- `my_lib.py`: user defined libraries, like PIL libs. This is optional. +- `word_dict.pickle`: dict file for training word embeding. +- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). 
***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables: + + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + ``` -### Kill Cluster Job -`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should mannally kill job if program crashed. +- `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here. +- `test_data_dir`: containing testing data. -### Check Cluster Training Result -Check log in $workspace/log for details, each node owns same log structure. +## Use cluster platforms or cluster management tools -`paddle_trainer.INFO` -It provides almost all interal output log for training, same as local training. Check runtime model convergence here. +PaddlePaddle supports running jobs on several platforms including: +- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google. +- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework. +- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster. -`paddle_pserver2.INFO` -It provides pserver running log, which could help to diagnose distributed error. +We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2). -`server.log` -It provides stderr and stdout of pserver process. Check error log if training crashs. +These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc. -`train.log` -It provides stderr and stdout of trainer process. Check error log if training crashs. +## Use different clusters -### Check Model Output -After one pass finished, model files will be writed in `output` directory in node 0. -`nodefile` in workspace indicates the node id of current cluster job. 
+ - [fabric](fabric_en.md) + - [openmpi](openmpi_en.md) + - [kubernetes](k8s_en.md) + - [kubernetes on AWS](k8s_aws_en.md) diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/usage/cluster/fabric_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..0385e401b399a51fad112e604dc56cb2f84c0a4b --- /dev/null +++ b/doc/howto/usage/cluster/fabric_cn.md @@ -0,0 +1,42 @@ +# 使用fabric启动集群训练 + +## 准备一个Linux集群 +可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。 + +## 启动集群作业 + +`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 + +`paddle.py` 为方便作业启动提供了两个独特的命令选项。 + +- `job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。 +- `job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。 + +`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务,只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后: +``` +sh run.sh +``` + +集群作业将会在几秒后启动。 + +## 终止集群作业 +`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。 + +## 检查集群训练结果 +详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。 + +`paddle_trainer.INFO` +提供几乎所有训练的内部输出日志,与本地训练相同。这里检验运行时间模型的收敛。 + +`paddle_pserver2.INFO` +提供 pserver 运行日志,有助于诊断分布式错误。 + +`server.log` +提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。 + +`train.log` +提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。 + +## 检查模型输出 +运行完成后,模型文件将被写入节点 0 的 `output` 目录中。 +工作空间中的 `nodefile` 表示当前集群作业的节点 ID。 diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/usage/cluster/fabric_en.md new file mode 100644 index 0000000000000000000000000000000000000000..bf270d89ab8514801ca4629cf412f73257429df9 --- /dev/null +++ b/doc/howto/usage/cluster/fabric_en.md @@ -0,0 +1,43 @@ +# Cluster Training Using Fabric + +## Prepare a Linux cluster + +Run `kubectl -f ssh_servers.yaml` under the directory: `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes. + +## Launching Cluster Job +`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. + +`paddle.py`provides two distinguished command option for easy job launching. + +- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying. +- `job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy +dispatch latency. + +`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then: +``` +sh run.sh +``` + +The cluster Job will start in several seconds. + +## Kill Cluster Job +`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed. 
+ +## Check Cluster Training Result +Check log in $workspace/log for details, each node owns same log structure. + +`paddle_trainer.INFO` +It provides almost all internal output log for training, same as local training. Check runtime model convergence here. + +`paddle_pserver2.INFO` +It provides parameter server running log, which could help to diagnose distributed error. + +`server.log` +It provides stderr and stdout of parameter server process. Check error log if training crashes. + +`train.log` +It provides stderr and stdout of trainer process. Check error log if training crashes. + +## Check Model Output +After one pass finished, model files will be written in `output` directory in node 0. +`nodefile` in workspace indicates the node id of current cluster job. diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/usage/cluster/fluid_cluster_train_en.md new file mode 100644 index 0000000000000000000000000000000000000000..ae825d9a517c7e9005d4e32f8f34b3f6a79be0c9 --- /dev/null +++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md @@ -0,0 +1,153 @@ +# Fluid Distributed Training + +## Introduction + +In this article, we'll explain how to configure and run distributed training jobs with PaddlePaddle Fluid in a bare metal cluster. + +## Preparations + +### Getting the cluster ready + +Prepare the compute nodes in the cluster. Nodes in this cluster can be of any specification that runs PaddlePaddle, and with a unique IP address assigned to it. Make sure they can communicate to each other. + +### Have PaddlePaddle installed + +PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes, be sure to properly install drivers and CUDA libraries. + +PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html). + +In addition to above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to on. An example bare minimum `cmake` command would look as follows: + +``` bash +cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON +``` + +### Update the training script + +#### Non-cluster training script + +Let's take [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)'s first chapter: "fit a line" as an example. 
+ +The non-cluster version of this demo with fluid API is as follows: + +``` python +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + +x = fluid.layers.data(name='x', shape=[13], dtype='float32') +y_predict = fluid.layers.fc(input=x, size=1, act=None) +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +exe.run(fluid.default_startup_program()) + +PASS_NUM = 100 +for pass_id in range(PASS_NUM): + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") + for data in train_reader(): + avg_loss_value, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + if avg_loss_value[0] < 10.0: + exit(0) # if avg cost less than 10.0, we think our code is good. +exit(1) +``` + +We created a simple fully-connected neural network training program and handed it to the fluid executor to run for 100 passes. + +Now let's try to convert it to a distributed version to run on a cluster. + +#### Introducing parameter server + +As we can see from the non-cluster version of training script, there is only one role in the script: the trainer, that performs the computing as well as holds the parameters. In cluster training, since multi-trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle. + +![parameter server architecture](src/trainer.png) + +Parameter Server in fluid not only holds the parameters but is also assigned with a part of the program. Trainers communicate with parameter servers via send/receive OPs. For more technical details, please refer to [this document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md). + +Now we need to create programs for both: trainers and parameter servers, the question is how? + +#### Slice the program + +Fluid provides a tool called "Distributed Transpiler" that automatically converts the non-cluster program into cluster program. + +The idea behind this tool is to find the optimize OPs and gradient parameters, slice the program into 2 pieces and connect them with send/receive OP. + +Optimize OPs and gradient parameters can be found from the return values of optimizer's minimize function. + +To put them together: + +``` python +... #define the program, cost, and create sgd optimizer + +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters + +t = fluid.DistributeTranspiler() # create the transpiler instance +# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + +... 
#create executor + +# in pserver, run this +#current_endpoint here means current pserver IP:PORT you wish to run on +pserver_prog = t.get_pserver_program(current_endpoint) +pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) +exe.run(pserver_startup) +exe.run(pserver_prog) + +# in trainer, run this +... # define data reader +exe.run(fluid.default_startup_program()) +for pass_id in range(100): + for data in train_reader(): + exe.run(t.get_trainer_program()) + + +``` + +### E2E demo + +Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). +First `cd` into the folder that contains the `python` files. In this case: + +```bash +cd /paddle/python/paddle/v2/fluid/tests/book_distribute +``` + +In parameter server node run the following in the command line: + +``` bash +PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py +``` + +*please note we assume that your parameter server runs at 192.168.1.2:6174* + +Wait until the prompt `Server listening on 192.168.1.2:6174` + +Then in 2 of your trainer nodes run this: + +``` bash +PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py +``` + +*the reason you need to run this command twice in 2 nodes is because: in the script we set the trainer count to be 2. You can change this setting on line 50* + +Now you have 2 trainers and 1 parameter server up and running. diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/usage/cluster/k8s_aws_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..c44cd9a731bed7067cdf19aa2f714abdce6c736a --- /dev/null +++ b/doc/howto/usage/cluster/k8s_aws_cn.md @@ -0,0 +1 @@ +k8s_aws_en.md \ No newline at end of file diff --git a/doc/howto/usage/cluster/k8s_aws_en.md b/doc/howto/usage/cluster/k8s_aws_en.md new file mode 100644 index 0000000000000000000000000000000000000000..0dfa8237a3fa2c9c3ee11e873c9fbbed3cd6018f --- /dev/null +++ b/doc/howto/usage/cluster/k8s_aws_en.md @@ -0,0 +1,689 @@ + +# Distributed PaddlePaddle Training on AWS with Kubernetes + +We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts. + +## Distributed PaddlePaddle Training Core Concepts + +### Distributed Training Job + +A distributed training job is represented by a [Kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job). + +Each Kuberentes job is described by a job config file, which specifies the information like the number of [pods](https://kubernetes.io/docs/user-guide/pods/#what-is-a-pod) in the job and environment variables. + +In a distributed training job, we would: + +1. prepare partitioned training data and configuration file on a distributed file system (in this tutorial we use Amazon Elastic File System), and +1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job. + +### Parameter Servers and Trainers + +There are two roles in a PaddlePaddle cluster: *parameter server (pserver)* and *trainer*. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. 
During the training process, trainers send model updates to the parameter servers; the parameter servers aggregate these updates so that every trainer can synchronize its local copy with the global model. + +
![Model is partitioned into two shards, each managed by one parameter server.](src/pserver_and_trainer.png)
+ +In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be schduled onto another node of different ip address. However, now we are using static ip. This will be improved. + +Parameter server and trainer are packaged into a same docker image. They will run once pod is scheduled by kubernetes job. + +### Trainer ID + +Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID. + +### Training + +The entry-point of a container is a shell script. It can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job. + +We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows: + +1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip. +1. Copy the training data from EFS persistent volume into container. +1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes. +1. Trainer with `train_id` 0 will automatically write results onto EFS volume. + + +## PaddlePaddle on AWS with Kubernetes + +### Choose AWS Service Region +This tutorial requires several AWS services work in the same region. Before we create anything in AWS, please check the following link +https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ +Choose a region which has the following services available: EC2, EFS, VPS, CloudFormation, KMS, VPC, S3. +In this tutorial, we use "Oregon(us-west-2)" as example. + +### Create AWS Account and IAM Account + +Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user. + +To sign up an AWS account, please +follow +[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html). +To create IAM users and user groups under an AWS account, please +follow +[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html). + +Please be aware that this tutorial needs the following privileges for the user in IAM: + +- AmazonEC2FullAccess +- AmazonS3FullAccess +- AmazonRoute53FullAccess +- AmazonRoute53DomainsFullAccess +- AmazonElasticFileSystemFullAccess +- AmazonVPCFullAccess +- IAMUserSSHKeys +- IAMFullAccess +- NetworkAdministrator +- AWSKeyManagementServicePowerUser + + +### Download kube-aws and kubectl + +#### kube-aws + +[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS. +##### Verify kube-aws integrity +Note: if you are using a non-official release (e.g RC release) kube-aws, you can skip this setp. 
+Import the CoreOS Application Signing Public Key: + +``` +gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E +``` + +Validate the key fingerprint: + +``` +gpg2 --fingerprint FC8A365E +``` +The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E` + +We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1 + +Validate the tarball's GPG signature: + +``` +PLATFORM=linux-amd64 + # Or +PLATFORM=darwin-amd64 + +gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz +``` +##### Install kube-aws +Extract the binary: + +``` +tar zxvf kube-aws-${PLATFORM}.tar.gz +``` + +Add kube-aws to your path: + +``` +mv ${PLATFORM}/kube-aws /usr/local/bin +``` + + +#### kubectl + +[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters. + +Download `kubectl` from the Kubernetes release artifact site with the `curl` tool. + +``` +# OS X +curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl + +# Linux +curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl +``` + +Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`): + +``` +chmod +x ./kubectl +sudo mv ./kubectl /usr/local/bin/kubectl +``` + +### Configure AWS Credentials + +First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface. + +And then configure your AWS account information: + +``` +aws configure +``` + + +Fill in the required fields: + + +``` +AWS Access Key ID: YOUR_ACCESS_KEY_ID +AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY +Default region name: us-west-2 +Default output format: json +``` + +`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` is the IAM key and secret from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account) + +Verify that your credentials work by describing any instances you may already have running on your account: + +``` +aws ec2 describe-instances +``` + +### Define Cluster Parameters + +#### EC2 key pair + +The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node. + +Follow [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create a EC2 key pair + +After creating a key pair, you will use the key pair name to configure the cluster. + +Key pairs are only available to EC2 instances in the same region. We are using us-west-2 in our tutorial, so make sure to creat key pairs in that region (Oregon). + +Your browser will download a `key-name.pem` file which is the key to access the EC2 instances. We will use it later. + + +#### KMS key + +Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key. 
+ +You can create a KMS key with the aws command line tool: + +``` +aws kms --region=us-west-2 create-key --description="kube-aws assets" +{ + "KeyMetadata": { + "CreationDate": 1458235139.724, + "KeyState": "Enabled", + "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx", + "AWSAccountId": "xxxxxxxxxxxxx", + "Enabled": true, + "KeyUsage": "ENCRYPT_DECRYPT", + "KeyId": "xxxxxxxxx", + "Description": "kube-aws assets" + } +} +``` + +We will need to use the value of `Arn` later. + +And then let's add several inline policies in your IAM user permission. + +Go to [IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home). Click on button `Users`, click user that we just created, and then click on `Add inline policy` button, and select `Custom Policy`. + +Paste into following inline policies: + +``` + (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "Stmt1482205552000", + "Effect": "Allow", + "Action": [ + "kms:Decrypt", + "kms:Encrypt" + ], + "Resource": [ + "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*" + ] + }, + { + "Sid": "Stmt1482205746000", + "Effect": "Allow", + "Action": [ + "cloudformation:CreateStack", + "cloudformation:UpdateStack", + "cloudformation:DeleteStack", + "cloudformation:DescribeStacks", + "cloudformation:DescribeStackResource", + "cloudformation:GetTemplate", + "cloudformation:DescribeStackEvents" + ], + "Resource": [ + "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*" + ] + } + ] +} +``` +`Version` : Its value has to be exactly "2012-10-17". +`AWS_ACCOUNT_ID`: You can get it from following command line: + +``` +aws sts get-caller-identity --output text --query Account +``` + +`MY_CLUSTER_NAME`: Pick a MY_CLUSTER_NAME that you like, you will use it later as well. +Please note, stack name must satisfy regular expression pattern: [a-zA-Z][-a-zA-Z0-9*]*, which means no "_" or "-" in stack name, or kube-aws will throw error in later steps. + +#### External DNS name + +When the cluster is created, the controller will expose the TLS-secured API on a DNS name. + +DNS name should have a CNAME points to cluster DNS name or an A record points to the cluster IP address. + +We will need to use DNS name later in tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate cluster IP with that DNS name for your local machine. And add name service (route53) in aws to associate the IP to paddle for cluster. We will find the cluster IP in later steps. + +#### S3 bucket + +You need to create an S3 bucket before startup the Kubernetes cluster. + +There are some bugs in aws cli in creating S3 bucket, so let's use the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2). + +Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-2 (Oregon). 
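+
+If you want to double check the bucket from your terminal, the AWS CLI configured earlier can also list it (this is only an optional verification step; `BUCKET_NAME` is the name you just chose):
+
+```
+aws s3 ls | grep BUCKET_NAME
+```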
+ + +#### Initialize Assets + +Create a directory on your local machine to hold the generated assets: + +``` +$ mkdir my-cluster +$ cd my-cluster +``` + +Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step: + +``` +kube-aws init \ +--cluster-name=MY_CLUSTER_NAME \ +--external-dns-name=MY_EXTERNAL_DNS_NAME \ +--region=us-west-2 \ +--availability-zone=us-west-2a \ +--key-name=KEY_PAIR_NAME \ +--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx" +``` + +`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key) + +`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name) + +`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair) + +`--kms-key-arn`: the "Arn" in [KMS key](#kms-key) + +Here `us-west-2a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts. + +Please check if `us-west-2a` is supported by `aws ec2 --region us-west-2 describe-availability-zones`, if not switch to other supported availability zone. (e.g., `us-west-2a`, or `us-west-2b`) + + +There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster. + +By default `kube-aws` will only create one worker node. Let's edit `cluster.yaml` and change `workerCount` from 1 to 3. + + +#### Render contents of the asset directory + +In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you. + +``` +kube-aws render credentials --generate-ca +``` + +The next command generates the default set of cluster assets in your asset directory. + +``` +kube-aws render stack +``` +Assets (templates and credentials) that are used to create, update and interact with your Kubernetes cluster will be created under your current folder. + + +### Kubernetes Cluster Start Up + +#### Create the instances defined in the CloudFormation template + +Now let's create your cluster (choose any `PREFIX` for the command below): + +``` +kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX +``` + +`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket) + + +#### Configure DNS + +You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation. + +``` +$ kube-aws status +Cluster Name: paddle-cluster +Controller DNS Name: paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com +``` + +If you own a DNS name, set the A record to any of the above ip. __Or__ you can set up CNAME point to `Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`) + +##### Find IP address + +Use command `dig` to check the load balancer hostname to get the ip address. + +``` +$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com + +;; QUESTION SECTION: +;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A + +;; ANSWER SECTION: +paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52 +paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112 +``` + +In the above output, both ip `54.241.164.52`, `54.67.102.112` will work. + +*If you own a DNS name*, set the A record to any of the above ip. Then you can skip to the step "Access the cluster". + +*If you do not own a DNS name*: +##### Update local DNS association +Edit `/etc/hosts` to associate above ip with the DNS name. 
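+
+For example, assuming you picked the DNS name `paddle` and got `54.241.164.52` from the `dig` output above, the line added to `/etc/hosts` would look like:
+
+```
+54.241.164.52 paddle
+```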
+##### Add Route53 private name service in VPC + - Open [Route53 Console](https://console.aws.amazon.com/route53/home) + - Create hosted zone with following config + - Domain name: "paddle" + - Type: "Private hosted zone for amazon VPC" + - VPC ID: `` + + ![route53 zone setting](src/route53_create_zone.png) + - Add A record + - Click on the zone "paddle" just created + - Click the button "Create record set" + - Name : leave blank + - type: "A" + - Value: `` + + ![route53 create recordset](src/route53_create_recordset.png) + - Verify name service + - Connect to any instance created by kube-aws via ssh + - Run command "host paddle", see if the ip returned is the private ip of kube-controller + +#### Access the cluster + +Once the API server is running, you should see: + +``` +$ kubectl --kubeconfig=kubeconfig get nodes +NAME STATUS AGE +ip-10-0-0-134.us-west-2.compute.internal Ready 6m +ip-10-0-0-238.us-west-2.compute.internal Ready 6m +ip-10-0-0-50.us-west-2.compute.internal Ready 6m +ip-10-0-0-55.us-west-2.compute.internal Ready 6m +``` + + +### Setup Elastic File System for Cluster + +Training data is usually served on a distributed filesystem, we use Elastic File System (EFS) on AWS. + +1. Create security group for EFS in [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) + 1. Look up security group id for `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below) +
![](src/worker_security_group.png)
+ 2. Add a security group `paddle-efs` with an `ALL TCP` inbound rule whose custom source is the group id of `paddle-cluster-sg-worker`, in the `paddle-cluster-vpc` VPC. Make sure the availability zone is the same one you used in [Initialize Assets](#initialize-assets). +
![](src/add_security_group.png)
+ +2. Create the Elastic File System in the [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) with the `paddle-cluster-vpc` VPC. Make sure the subnet is `paddle-cluster-Subnet0` and the security group is `paddle-efs`. +
![](src/create_efs.png)
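+
+The EFS console shows the new file system's DNS name, which we will need later when mounting the volume. If you prefer the command line, the file system can also be looked up with the AWS CLI configured earlier (only a verification step; the DNS name then has the form `FILE_SYSTEM_ID.efs.us-west-2.amazonaws.com`):
+
+```
+aws efs describe-file-systems --region us-west-2
+```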
+ + +### Start PaddlePaddle Training Demo on AWS + +#### Configure Kubernetes Volume that Points to EFS + +First we need to create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) to provision EFS volumn. + +Save following snippet as `pv.yaml` +``` +apiVersion: v1 +kind: PersistentVolume +metadata: + name: efsvol +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteMany + nfs: + server: EFS_DNS_NAME + path: "/" +``` + +`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com` + +Run following command to create a persistent volumn: +``` +kubectl --kubeconfig=kubeconfig create -f pv.yaml +``` + +Next let's create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume. + +Save following snippet as `pvc.yaml`. +``` +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: efsvol +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 50Gi +``` + +Run following command to create a persistent volumn claim: +``` +kubectl --kubeconfig=kubeconfig create -f pvc.yaml +``` + +#### Prepare Training Data + +We will now launch a kubernetes job that downloads, saves and evenly splits training data into 3 shards on the persistent volumn that we just created. + +save following snippet as `paddle-data-job.yaml` +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-data +spec: + template: + metadata: + name: pi + spec: + containers: + - name: paddle-data + image: paddlepaddle/paddle-tutorial:k8s_data + imagePullPolicy: Always + volumeMounts: + - mountPath: "/efs" + name: efs + env: + - name: OUT_DIR + value: /efs/paddle-cluster-job + - name: SPLIT_COUNT + value: "3" + volumes: + - name: efs + persistentVolumeClaim: + claimName: efsvol + restartPolicy: Never +``` + +Run following command to launch the job: +``` +kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml +``` + +Job may take 7 min to finish, use following command to check job status. Do not proceed until `SUCCESSFUL` for `paddle-data` job is `1` +``` +$ kubectl --kubeconfig=kubeconfig get jobs +NAME DESIRED SUCCESSFUL AGE +paddle-data 1 1 6m +``` + +Data preparation is done by docker image `paddlepaddle/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code. + +#### Start Training + +Now we are ready to start paddle training job. 
Save following snippet as `paddle-cluster-job.yaml` +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-cluster-job +spec: + parallelism: 3 + completions: 3 + template: + metadata: + name: paddle-cluster-job + spec: + volumes: + - name: efs + persistentVolumeClaim: + claimName: efsvol + containers: + - name: trainer + image: paddlepaddle/paddle-tutorial:k8s_train + command: ["bin/bash", "-c", "/root/start.sh"] + env: + - name: JOB_NAME + value: paddle-cluster-job + - name: JOB_PATH + value: /home/jobpath + - name: JOB_NAMESPACE + value: default + - name: TRAIN_CONFIG_DIR + value: quick_start + - name: CONF_PADDLE_NIC + value: eth0 + - name: CONF_PADDLE_PORT + value: "7164" + - name: CONF_PADDLE_PORTS_NUM + value: "2" + - name: CONF_PADDLE_PORTS_NUM_SPARSE + value: "2" + - name: CONF_PADDLE_GRADIENT_NUM + value: "3" + - name: TRAINER_COUNT + value: "3" + volumeMounts: + - mountPath: "/home/jobpath" + name: efs + ports: + - name: jobport0 + hostPort: 7164 + containerPort: 7164 + - name: jobport1 + hostPort: 7165 + containerPort: 7165 + - name: jobport2 + hostPort: 7166 + containerPort: 7166 + - name: jobport3 + hostPort: 7167 + containerPort: 7167 + restartPolicy: Never +``` + +`parallelism: 3, completions: 3` means this job will simultaneously start 3 PaddlePaddle pods, and this job will be finished when there are 3 finished pods. + +`env` field represents container's environment variables, we specify PaddlePaddle parameters by environment variables. + +`ports` indicates that TCP port 7164 - 7167 are exposed for communication between `pserver` ans trainer. port starts continously from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). We use multiple ports for dense and sparse paramter updates to improve latency. + +Run following command to launch the job. +``` +kubectl --kubeconfig=kubeconfig create -f paddle-claster-job.yaml +``` + +Inspect individual pods + +``` +$ kubectl --kubeconfig=kubeconfig get pods +NAME READY STATUS RESTARTS AGE +paddle-cluster-job-cm469 1/1 Running 0 9m +paddle-cluster-job-fnt03 1/1 Running 0 9m +paddle-cluster-job-jx4xr 1/1 Running 0 9m +``` + +Inspect individual console output +``` +kubectl --kubeconfig=kubeconfig log -f POD_NAME +``` + +`POD_NAME`: name of any pod (e.g., `paddle-cluster-job-cm469`). + +Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes. + +The details for start `pserver` and `trainer` are hidden inside docker image `paddlepaddle/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code. + +#### Inspect Training Output + +Training output (model snapshot and logs) will be saved in EFS. We can ssh into worker EC2 instance, mount EFS and check training output. + +1. ssh Into Worker EC2 instance +``` +chmod 400 key-name.pem +ssh -i key-name.pem core@INSTANCE_IP +``` + +`INSTANCE_IP`: public IP address of EC2 kubernetes worker node. Go to [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check `public IP` of any `paddle-cluster-kube-aws-worker` instance. + +2. Mount EFS +``` +mkdir efs +sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs +``` + +`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Look similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`. 
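+
+Before browsing the output, a quick sanity check that the mount succeeded might look like this (the commands below are only an illustration):
+
+```
+df -h | grep efs
+ls efs
+```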
+ +Now folder `efs` will have structure similar to: +``` +-- paddle-cluster-job + |-- ... + |-- output + | |-- node_0 + | | |-- server.log + | | `-- train.log + | |-- node_1 + | | |-- server.log + | | `-- train.log + | |-- node_2 + | | |-- server.log + | | `-- train.log + | |-- pass-00000 + | | |-- ___fc_layer_0__.w0 + | | |-- ___fc_layer_0__.wbias + | | |-- done + | | |-- path.txt + | | `-- trainer_config.lr.py + | |-- pass-00001... +``` +`server.log` contains log for `pserver`. `train.log` contains log for `trainer`. Model description and snapshot is stored in `pass-0000*`. + +### Kubernetes Cluster Tear Down + +#### Delete EFS + +Go to [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volumn that we created. + +#### Delete security group + +Go to [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete security group `paddle-efs`. + + +#### Delete S3 Bucket + +Go to [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket that we created. + +#### Destroy Cluster + +``` +kube-aws destroy +``` + +The command will return immediately, but it might take 5 min to tear down the whole cluster. + +You can go to [CludFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check destroy process. diff --git a/doc/howto/usage/cluster/k8s_cn.md b/doc/howto/usage/cluster/k8s_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..c1a11f7165a2f9da9dd044641274447e7943a597 --- /dev/null +++ b/doc/howto/usage/cluster/k8s_cn.md @@ -0,0 +1,206 @@ +# Kubernetes单机训练 + +在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。 + +## 制作Docker镜像 + +在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式PaddlePaddle训练任务中 +的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在 +PaddlePaddle的Docker Image里。为此,我们需要制作一个包含训练数据的PaddlePaddle镜像。 + +PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo, +(请注意,默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的,PaddlePaddle的各版本镜像可以参考 +[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)), +下面我们使用这个镜像来下载数据到Docker Container中,并把这个包含了训练数据的Container保存为一个新的镜像。 + +### 运行容器 + +``` +$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest +``` + +### 下载数据 + +进入容器`/root/paddle/demo/quick_start/data`目录,使用`get_data.sh`下载数据 + +``` +$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh + +Downloading Amazon Electronics reviews data... +--2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz +Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80 +Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected. +HTTP request sent, awaiting response... 
200 OK +Length: 495854086 (473M) [application/x-gzip] +Saving to: 'reviews_Electronics_5.json.gz' + + 10% [=======> ] 874,279 64.7KB/s eta 2h 13m + +``` + +### 修改启动脚本 + +下载完数据后,修改`/root/paddle/demo/quick_start/train.sh`文件,内容如下(增加了一条cd命令) +``` +set -e +cd /root/paddle/demo/quick_start +cfg=trainer_config.lr.py +#cfg=trainer_config.emb.py +#cfg=trainer_config.cnn.py +#cfg=trainer_config.lstm.py +#cfg=trainer_config.bidi-lstm.py +#cfg=trainer_config.db-lstm.py +paddle train \ + --config=$cfg \ + --save_dir=./output \ + --trainer_count=4 \ + --log_period=20 \ + --num_passes=15 \ + --use_gpu=false \ + --show_parameter_stats_period=100 \ + --test_all_data_in_one_period=1 \ + 2>&1 | tee 'train.log' +``` + +### 提交镜像 + +修改启动脚本后,退出容器,使用`docker commit`命令创建新镜像。 + +``` +$ docker commit quick_start_data mypaddle/paddle:quickstart +``` + +## 使用 Kubernetes 进行训练 + +>针对任务运行完成后容器自动退出的场景,Kubernetes有Job类型的资源来支持。下文就是用Job类型的资源来进行训练。 + +### 编写yaml文件 + +在训练时,输出结果可能会随着容器的消耗而被删除,需要在创建容器前挂载卷以便我们保存训练结果。使用我们之前构造的镜像,可以创建一个 [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job),简单的yaml文件如下: + +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: quickstart +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: quickstart + spec: + volumes: + - name: output + hostPath: + path: /home/work/paddle_output + containers: + - name: pi + image: mypaddle/paddle:quickstart + command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"] + volumeMounts: + - name: output + mountPath: /root/paddle/demo/quick_start/output + restartPolicy: Never +``` + +### 创建PaddlePaddle Job + +使用上文创建的yaml文件创建Kubernetes Job,命令为: + +``` +$ kubectl create -f paddle.yaml +``` + +查看job的详细情况: + +``` +$ kubectl get job +NAME DESIRED SUCCESSFUL AGE +quickstart 1 0 58s + +$ kubectl describe job quickstart +Name: quickstart +Namespace: default +Image(s): registry.baidu.com/public/paddle:cpu-demo-latest +Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84 +Parallelism: 1 +Completions: 1 +Start Time: Mon, 31 Oct 2016 11:20:16 +0800 +Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart +Pods Statuses: 0 Running / 1 Succeeded / 0 Failed +Volumes: + output: + Type: HostPath (bare host directory volume) + Path: /home/work/paddle_output +Events: + FirstSeen LastSeen Count From SubobjectPath Type Reason Message + --------- -------- ----- ---- ------------- -------- ------ ------- + 1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx +``` + +### 查看训练结果 + +根据Job对应的Pod信息,可以查看此Pod运行的宿主机。 + +``` +kubectl describe pod quickstart-fa0wx +Name: quickstart-fa0wx +Namespace: default +Node: paddle-demo-let02/10.206.202.44 +Start Time: Mon, 31 Oct 2016 11:20:17 +0800 +Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart +Status: Succeeded +IP: 10.0.0.9 +Controllers: Job/quickstart +Containers: + quickstart: + Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465 + Image: registry.baidu.com/public/paddle:cpu-demo-latest + Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750 + Port: + Command: + bin/bash + -c + /root/paddle/demo/quick_start/train.sh + QoS Tier: + cpu: BestEffort + memory: BestEffort + State: Terminated + Reason: Completed + Exit Code: 0 + Started: Mon, 31 Oct 2016 11:20:20 +0800 + Finished: Mon, 31 Oct 2016 11:21:46 +0800 + Ready: False + Restart Count: 0 + Environment Variables: +Conditions: + Type Status + Ready False +Volumes: + output: + Type: HostPath (bare 
host directory volume) + Path: /home/work/paddle_output +``` + +我们还可以登录到宿主机上查看训练结果。 + +``` +[root@paddle-demo-let02 paddle_output]# ll +total 60 +drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000 +drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014 +``` diff --git a/doc/howto/usage/cluster/k8s_distributed_cn.md b/doc/howto/usage/cluster/k8s_distributed_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..167089b8074b33e3b094fa3ec8e377630cec42ac --- /dev/null +++ b/doc/howto/usage/cluster/k8s_distributed_cn.md @@ -0,0 +1,312 @@ +# Kubernetes分布式训练 + +前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。 + +## 整体方案 + +在训练之前,用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统,需要使用其制定的方式挂载后并导入数据),训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下: + +![paddle on kubernetes结构图](src/k8s-paddle-arch.png) + +上图描述了一个3节点的分布式训练场景,在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行,每个pod包含一个PaddlePaddle容器。在容器创建后,会启动pserver与trainer进程,读取volume中的数据进行这次分布式训练。 + +根据前文的描述,要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练,按照下面步骤即可: + +1. [制作PaddlePaddle镜像](#制作镜像) +1. [将训练文件与切分好的数据上传到共享存储](#上传训练文件) +1. [编写本次训练的YAML文件,创建一个Kubernetes job](#创建Job) +1. [训练结束后查看输出结果](#查看输出) + +下面就根据这几个步骤分别介绍。 + +### 制作镜像 + +PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境,用这个镜像创建的容器需要有以下两个功能: + +- 拷贝训练文件到容器内 +- 生成`paddle pserver`与`paddle train`进程的启动参数,并且启动训练 + +因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。 + +```bash +$ cd doc/howto/usage/k8s/src/k8s_train +$ docker build -t [YOUR_REPO]/paddle:mypaddle . +``` + +然后将构建成功的镜像上传到镜像仓库。 + +```bash +docker push [YOUR_REPO]/paddle:mypaddle +``` + +注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。 + +### 准备训练数据 + +这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据,也可以通过修改[k8s_train](./src/k8s_train/README.md)的内容来定制image. + +在启动Job之前,需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下. 
+ +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-data +spec: + template: + metadata: + name: pi + spec: + hostNetwork: true + containers: + - name: paddle-data + image: paddlepaddle/paddle-tutorial:k8s_data + imagePullPolicy: Always + volumeMounts: + - mountPath: "/mnt" + name: nfs + env: + - name: OUT_DIR + value: /home/work/mfs/paddle-cluster-job + - name: SPLIT_COUNT + value: "3" + volumes: + - name: nfs + persistentVolumeClaim: + claimName: mfs + restartPolicy: Never +``` + +完成后volume中的文件内容大致如下: +```base +[root@paddle-kubernetes-node0 nfsdir]$ tree -d +. +`-- paddle-cluster-job + |-- 0 + | `-- data + |-- 1 + | `-- data + |-- 2 + | `-- data + |-- output + |-- quick_start +``` + +目录中paddle-cluster-job是本次训练对应的job name,本次训练要求有3个PaddlePaddle节点,在paddle-cluster-job/data目录中存放切分好的数据,文件夹0,1,2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件,output文件夹存放训练结果与日志。 + +### 创建Job + +Kubernetes可以通过YAML文件来创建相关对象,然后可以使用命令行工具创建job。 + +Job YAML文件描述了这次训练使用的Docker镜像,需要启动的节点个数以及 `paddle pserver`与 `paddle train`进程启动的必要参数,也描述了容器需要使用的存储卷挂载的情况。YAML文件中各个字段的具体含义,可以查看[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job)。例如,本次训练的YAML文件可以写成: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-cluster-job +spec: + parallelism: 3 + completions: 3 + template: + metadata: + name: paddle-cluster-job + spec: + volumes: + - name: jobpath + hostPath: + path: /home/work/mfs + containers: + - name: trainer + image: [YOUR_REPO]/paddle:mypaddle + command: ["bin/bash", "-c", "/root/start.sh"] + env: + - name: JOB_NAME + value: paddle-cluster-job + - name: JOB_PATH + value: /home/jobpath + - name: JOB_NAMESPACE + value: default + - name: TRAIN_CONFIG_DIR + value: recommendation + - name: CONF_PADDLE_NIC + value: eth0 + - name: CONF_PADDLE_PORT + value: "7164" + - name: CONF_PADDLE_PORTS_NUM + value: "2" + - name: CONF_PADDLE_PORTS_NUM_SPARSE + value: "2" + - name: CONF_PADDLE_GRADIENT_NUM + value: "3" + volumeMounts: + - name: jobpath + mountPath: /home/jobpath + restartPolicy: Never +``` + +文件中,`metadata`下的`name`表示这个job的名字。`parallelism,completions`字段表示这个job会同时开启3个PaddlePaddle节点,成功训练且退出的pod数目为3时,这个job才算成功结束。然后申明一个存储卷`jobpath`,代表宿主机目录`/home/work/mfs`,在对容器的描述`containers`字段中,将此目录挂载为容器的`/home/jobpath`目录,这样容器的`/home/jobpath`目录就成为了共享存储,放在这个目录里的文件其实是保存到了MFS上。 + +`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内: + + +- JOB_PATH:共享存储挂在的路径 +- JOB_NAME:Job的名字 +- TRAIN_CONFIG_DIR:本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径 +- CONF_PADDLE_NIC:`paddle pserver`进程需要的`--nics`参数,即网卡名 +- CONF_PADDLE_PORT:`paddle paserver`的`--port`参数 +- CONF_PADDLE_PORTS_NUM:稠密更新的端口数量,即`--ports_num`参数 +- CONF_PADDLE_PORTS_NUM_SPARSE:稀疏更新的端口数量,即`--ports_num_for_sparse`参数 +- CONF_PADDLE_GRADIENT_NUM:训练节点数量,即`--num_gradient_servers参数` + +这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。 + +编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。 + +```bash +kubectl create -f job.yaml +``` + +创建成功后,Kubernetes就会创建3个pod作为PaddlePaddle节点然后拉取镜像,启动容器开始训练。 + + +### 查看输出 + +在训练过程中,可以在共享存储上查看输出的日志和模型,例如output目录下就存放了输出结果。注意node_0,node_1,node_2这几个目录表示PaddlePaddle节点与trainer_id,并不是Kubernetes中的node概念。 + +```bash +[root@paddle-kubernetes-node0 output]# tree -d +. +├── node_0 +│   ├── server.log +│   └── train.log +├── node_1 +│   ├── server.log +│   └── train.log +├── node_2 +...... +├── pass-00002 +│   ├── done +│   ├── ___embedding_0__.w0 +│   ├── ___embedding_1__.w0 +...... 
+``` + +我们可以通过日志查看容器训练的情况,例如: + +```bash +[root@paddle-kubernetes-node0 node_0]# cat train.log +I1116 09:10:17.123121 50 Util.cpp:155] commandline: + /usr/local/bin/../opt/paddle/bin/paddle_trainer + --nics=eth0 --port=7164 + --ports_num=2 --comment=paddle_process_by_paddle + --pservers=192.168.129.66,192.168.223.143,192.168.129.71 + --ports_num_for_sparse=2 --config=./trainer_config.py + --trainer_count=4 --num_passes=10 --use_gpu=0 + --log_period=50 --dot_period=10 --saving_period=1 + --local=0 --trainer_id=0 + --save_dir=/home/jobpath/paddle-cluster-job/output +I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions +I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done. +[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config. +[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating] +[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__] +I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal +I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process +I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process +I1116 09:10:17.681543 50 GradientMachine.cpp:134] Initing parameters.. +I1116 09:10:18.012390 50 GradientMachine.cpp:141] Init parameters done. +I1116 09:10:18.018641 50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164 +I1116 09:10:18.018950 50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165 +I1116 09:10:18.019069 50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164 +I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165 +I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164 +I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165 +``` + + +## 一些细节的补充 + +### 使用环境变量 + +使用容器方式运行训练任务的Kubernetes Job,通常会使用环境变量配置Job的配置信息`start_paddle.py`提供了一个启动脚本,将环境变量转换成paddle的命令行参数: +``` +API = "/api/v1/namespaces/" +JOBSELECTOR = "labelSelector=job-name=" +JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME") +JOB_PATH_OUTPUT = JOB_PATH + "/output" +JOBNAME = os.getenv("JOB_NAME") +NAMESPACE = os.getenv("JOB_NAMESPACE") +PADDLE_NIC = os.getenv("CONF_PADDLE_NIC") +PADDLE_PORT = os.getenv("CONF_PADDLE_PORT") +PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM") +PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE") +PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM") +``` + +### Pod间通信 +`start_paddle.py`脚本开始时,会先进行参数的初始化与解析。 + +```python +parser = argparse.ArgumentParser(prog="start_paddle.py", + description='simple tool for k8s') + args, train_args_list = parser.parse_known_args() + train_args = refine_unknown_args(train_args_list) + train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2])) + podlist = getPodList() +``` + +然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态(容器运行都运行)时,再通过函数`getIdMap(podlist)`获取trainer_id。 + +```python + podlist = getPodList() + # need to wait until all pods are running + while not isPodAllRunning(podlist): + time.sleep(10) + podlist = getPodList() + idMap = getIdMap(podlist) +``` +* *注意*: `getPodList()`会获取当前namespace下的所有pod,如果已经有pod运行,可能会导致出错。这种集群节点管理方式会在将来使用[statfulsets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。 + +在函数`getIdMap(podlist)`内部,我们通过读取`podlist`中每个pod的IP地址,将IP排序生成的序号作为trainer_id。 + +```python +def 
getIdMap(podlist): + ''' + generate tainer_id by ip + ''' + ips = [] + for pod in podlist["items"]: + ips.append(pod["status"]["podIP"]) + ips.sort() + idMap = {} + for i in range(len(ips)): + idMap[ips[i]] = i + return idMap +``` + +在得到`idMap`后,通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。 + +### 启动任务 + +在函数`startPaddle`中,最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析,解析环境变量得到`PADDLE_NIC`,`PADDLE_PORT`,`PADDLE_PORTS_NUM`等参数,然后通过自身的IP地址在`idMap`中获取`trainerId`。 + +```python + program = 'paddle train' + args = " --nics=" + PADDLE_NIC + args += " --port=" + str(PADDLE_PORT) + args += " --ports_num=" + str(PADDLE_PORTS_NUM) + args += " --comment=" + "paddle_process_by_paddle" + ip_string = "" + for ip in idMap.keys(): + ip_string += (ip + ",") + ip_string = ip_string.rstrip(",") + args += " --pservers=" + ip_string + args_ext = "" + for key, value in train_args_dict.items(): + args_ext += (' --' + key + '=' + value) + localIP = socket.gethostbyname(socket.gethostname()) + trainerId = idMap[localIP] + args += " " + args_ext + " --trainer_id=" + \ + str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT +``` diff --git a/doc/howto/usage/cluster/k8s_en.md b/doc/howto/usage/cluster/k8s_en.md new file mode 100644 index 0000000000000000000000000000000000000000..c374f00a495d705ceddf8d3d930768ceeb93282b --- /dev/null +++ b/doc/howto/usage/cluster/k8s_en.md @@ -0,0 +1,210 @@ +# PaddlePaddle On Kubernetes + +In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster. + +## Build Docker Image + +In distributed Kubernetes cluster, we will use Ceph or other distributed +storage system for storing training related data so that all processes in +PaddlePaddle training can retrieve data from Ceph. In this example, we will +only demo training job on single machine. In order to simplify the requirement +of the environment, we will directly put training data into the PaddlePaddle Docker Image, +so we need to create a PaddlePaddle Docker image that includes the training data. + +The production Docker Image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle +source code and demo. (Caution: Default PaddlePaddle Docker Image `paddlepaddle/paddle:latest` doesn't include +the source code, PaddlePaddle's different versions of Docker Image can be referred here: +[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_en.html)), +so we run this Docker Image and download the training data, and then commit the whole +Container to be a new Docker Image. + +### Run Docker Container + +``` +$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest +``` + +### Download Training Data + +Getting into `/root/paddle/demo/quick_start/data` Directory,using `get_data.sh` to download training data. +Then getting into `/root/paddle/demo/quick_start` Directory, using `preprocess.sh` to pre-process training data. + +``` +$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh + +Downloading Amazon Electronics reviews data... +--2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz +Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80 +Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected. +HTTP request sent, awaiting response... 
200 OK +Length: 495854086 (473M) [application/x-gzip] +Saving to: 'reviews_Electronics_5.json.gz' + + 10% [=======> ] 874,279 64.7KB/s eta 2h 13m + +``` + +### Modify Startup Script + +After downloading the data,modify `/root/paddle/demo/quick_start/train.sh` file contents are as follows (one more cd cmd): +``` +set -e +cd /root/paddle/demo/quick_start +cfg=trainer_config.lr.py +#cfg=trainer_config.emb.py +#cfg=trainer_config.cnn.py +#cfg=trainer_config.lstm.py +#cfg=trainer_config.bidi-lstm.py +#cfg=trainer_config.db-lstm.py +paddle train \ + --config=$cfg \ + --save_dir=./output \ + --trainer_count=4 \ + --log_period=20 \ + --num_passes=15 \ + --use_gpu=false \ + --show_parameter_stats_period=100 \ + --test_all_data_in_one_period=1 \ + 2>&1 | tee 'train.log' +``` + +### Commit Docker Image + +``` +$ docker commit quick_start_data mypaddle/paddle:quickstart +``` + +## Use Kubernetes For Training + +We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes. + +### Create Yaml Files + +The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows: + +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: quickstart +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: quickstart + spec: + volumes: + - name: output + hostPath: + path: /home/work/paddle_output + containers: + - name: pi + image: mypaddle/paddle:quickstart + command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"] + volumeMounts: + - name: output + mountPath: /root/paddle/demo/quick_start/output + restartPolicy: Never +``` + +### Start PaddlePaddle Job + +Using the above yaml file to start the Kubernetes job. + +``` +$ kubectl create -f paddle.yaml +``` + +Get the detailed status of the job: + +``` +$ kubectl get job +NAME DESIRED SUCCESSFUL AGE +quickstart 1 0 58s + +$ kubectl describe job quickstart +Name: quickstart +Namespace: default +Image(s): registry.baidu.com/public/paddle:cpu-demo-latest +Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84 +Parallelism: 1 +Completions: 1 +Start Time: Mon, 31 Oct 2016 11:20:16 +0800 +Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart +Pods Statuses: 0 Running / 1 Succeeded / 0 Failed +Volumes: + output: + Type: HostPath (bare host directory volume) + Path: /home/work/paddle_output +Events: + FirstSeen LastSeen Count From SubobjectPath Type Reason Message + --------- -------- ----- ---- ------------- -------- ------ ------- + 1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx +``` + +### Get Training Result + +We can use kubectl command to take a look at the status of related pod. 
+ +``` +$ kubectl describe pod quickstart-fa0wx +Name: quickstart-fa0wx +Namespace: default +Node: paddle-demo-let02/10.206.202.44 +Start Time: Mon, 31 Oct 2016 11:20:17 +0800 +Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart +Status: Succeeded +IP: 10.0.0.9 +Controllers: Job/quickstart +Containers: + quickstart: + Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465 + Image: registry.baidu.com/public/paddle:cpu-demo-latest + Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750 + Port: + Command: + bin/bash + -c + /root/paddle/demo/quick_start/train.sh + QoS Tier: + cpu: BestEffort + memory: BestEffort + State: Terminated + Reason: Completed + Exit Code: 0 + Started: Mon, 31 Oct 2016 11:20:20 +0800 + Finished: Mon, 31 Oct 2016 11:21:46 +0800 + Ready: False + Restart Count: 0 + Environment Variables: +Conditions: + Type Status + Ready False +Volumes: + output: + Type: HostPath (bare host directory volume) + Path: /home/work/paddle_output +``` + +We can also ssh to Kubernetes node to take a look at the training result. + +``` +[root@paddle-demo-let02 paddle_output]# ll +total 60 +drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000 +drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014 +``` diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/usage/cluster/openmpi_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..831cafdc03c6a908f31769d0467de022df42dab5 --- /dev/null +++ b/doc/howto/usage/cluster/openmpi_cn.md @@ -0,0 +1,41 @@ +# 在OpenMPI集群中提交训练作业 + +## 准备OpenMPI集群 + +执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点: + +```bash +paddle/scripts/cluster_train_v2/openmpi/docker_cluster +kubectl create -f head.yaml +kubectl create -f mpi-nodes.yaml +``` + +然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。 + +## 启动集群作业 + +您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务: + +```bash +# 获得head和node节点的IP地址 +kubectl get po -o wide +# 将node节点的IP地址保存到machines文件中 +kubectl get po -o wide | grep nodes | awk '{print $6}' > machines +# 拷贝必要的文件到head节点 +scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ +# ssh 登录到head节点 +ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] +# --------------- 以下操作均在head节点中执行 --------------- +# 准备训练数据 +python prepare.py +# 拷贝训练程序和字典文件到每台MPI节点 +cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial +# 创建日志目录 +mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs +# 拷贝训练数据到各自的节点 +scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial +scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial +scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial +# 启动训练任务 +mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh +``` diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/usage/cluster/openmpi_en.md new file 
mode 100644 index 0000000000000000000000000000000000000000..09af46e25ebe1f843dc7c7be0997dc706413b65c --- /dev/null +++ b/doc/howto/usage/cluster/openmpi_en.md @@ -0,0 +1,41 @@ +# Cluster Training Using OpenMPI + +## Prepare an OpenMPI cluster + +Run the following command to start a 3-node MPI cluster and one "head" node. + +```bash +cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster +kubectl create -f head.yaml +kubectl create -f mpi-nodes.yaml +``` + +Then you can log in to every OpenMPI node using ssh without input any passwords. + +## Launching Cluster Job + +Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\ + +```bash +# find out node IP addresses +kubectl get po -o wide +# generate a "machines" file containing node IP addresses +kubectl get po -o wide | grep nodes | awk '{print $6}' > machines +# copy necessary files onto "head" node +scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ +# login to head node using ssh +ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] +# --------------- in head node --------------- +# prepare training data +python prepare.py +# copy training data and dict file to MPI nodes +cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial +# creat a directory for storing log files +mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs +# copy training data to every node +scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial +scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial +scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial +# start the job +mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh +``` diff --git a/doc/howto/usage/cluster/src/Dockerfile b/doc/howto/usage/cluster/src/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..e178bf4da0f32fca9586b5b69a2c7419de5d9cb1 --- /dev/null +++ b/doc/howto/usage/cluster/src/Dockerfile @@ -0,0 +1,7 @@ +FROM paddlepaddle/paddle:latest + +MAINTAINER zjsxzong89@gmail.com + +COPY start.sh /root/ +COPY start_paddle.py /root/ +CMD ["bash"," -c","/root/start.sh"] \ No newline at end of file diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/cluster/src/add_security_group.png similarity index 100% rename from doc/howto/usage/k8s/src/add_security_group.png rename to doc/howto/usage/cluster/src/add_security_group.png diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/cluster/src/create_efs.png similarity index 100% rename from doc/howto/usage/k8s/src/create_efs.png rename to doc/howto/usage/cluster/src/create_efs.png diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/cluster/src/efs_mount.png similarity index 100% rename from doc/howto/usage/k8s/src/efs_mount.png rename to doc/howto/usage/cluster/src/efs_mount.png diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/usage/cluster/src/k8s-paddle-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde Binary files /dev/null and b/doc/howto/usage/cluster/src/k8s-paddle-arch.png differ diff --git a/doc/howto/usage/k8s/src/k8s_data/Dockerfile b/doc/howto/usage/cluster/src/k8s_data/Dockerfile similarity index 100% rename from doc/howto/usage/k8s/src/k8s_data/Dockerfile rename to doc/howto/usage/cluster/src/k8s_data/Dockerfile diff --git a/doc/howto/usage/k8s/src/k8s_data/README.md b/doc/howto/usage/cluster/src/k8s_data/README.md similarity index 100% rename from 
doc/howto/usage/k8s/src/k8s_data/README.md rename to doc/howto/usage/cluster/src/k8s_data/README.md diff --git a/doc/howto/usage/k8s/src/k8s_data/get_data.sh b/doc/howto/usage/cluster/src/k8s_data/get_data.sh similarity index 100% rename from doc/howto/usage/k8s/src/k8s_data/get_data.sh rename to doc/howto/usage/cluster/src/k8s_data/get_data.sh diff --git a/doc/howto/usage/cluster/src/k8s_train/Dockerfile b/doc/howto/usage/cluster/src/k8s_train/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..77f021a89a70d934bf70424eaa3c6dc3f7c93a28 --- /dev/null +++ b/doc/howto/usage/cluster/src/k8s_train/Dockerfile @@ -0,0 +1,6 @@ +FROM paddlepaddle/paddle:latest + +COPY start.sh /root/ +COPY start_paddle.py /root/ +RUN chmod +x /root/start.sh +CMD ["bash"," -c","/root/start.sh"] diff --git a/doc/howto/usage/k8s/src/k8s_train/README.md b/doc/howto/usage/cluster/src/k8s_train/README.md similarity index 100% rename from doc/howto/usage/k8s/src/k8s_train/README.md rename to doc/howto/usage/cluster/src/k8s_train/README.md diff --git a/doc/howto/usage/k8s/src/k8s_train/start.sh b/doc/howto/usage/cluster/src/k8s_train/start.sh similarity index 100% rename from doc/howto/usage/k8s/src/k8s_train/start.sh rename to doc/howto/usage/cluster/src/k8s_train/start.sh diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/cluster/src/k8s_train/start_paddle.py similarity index 100% rename from doc/howto/usage/k8s/src/k8s_train/start_paddle.py rename to doc/howto/usage/cluster/src/k8s_train/start_paddle.py diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/cluster/src/managed_policy.png similarity index 100% rename from doc/howto/usage/k8s/src/managed_policy.png rename to doc/howto/usage/cluster/src/managed_policy.png diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/cluster/src/pserver_and_trainer.png similarity index 100% rename from doc/howto/usage/k8s/src/pserver_and_trainer.png rename to doc/howto/usage/cluster/src/pserver_and_trainer.png diff --git a/doc/howto/usage/k8s/src/route53_create_recordset.png b/doc/howto/usage/cluster/src/route53_create_recordset.png similarity index 100% rename from doc/howto/usage/k8s/src/route53_create_recordset.png rename to doc/howto/usage/cluster/src/route53_create_recordset.png diff --git a/doc/howto/usage/k8s/src/route53_create_zone.png b/doc/howto/usage/cluster/src/route53_create_zone.png similarity index 100% rename from doc/howto/usage/k8s/src/route53_create_zone.png rename to doc/howto/usage/cluster/src/route53_create_zone.png diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/usage/cluster/src/trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer.png differ diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/usage/cluster/src/trainer_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer_cn.png differ diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..9a65f14628d8a0808dce25187b482354c72a838d --- /dev/null +++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py @@ -0,0 +1,114 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gzip +import math + +import paddle.v2 as paddle + +embsize = 32 +hiddensize = 256 +N = 5 + + +def wordemb(inlayer): + wordemb = paddle.layer.embedding( + input=inlayer, + size=embsize, + param_attr=paddle.attr.Param( + name="_proj", + initial_std=0.001, + learning_rate=1, + l2_rate=0, + sparse_update=True)) + return wordemb + + +def main(): + # for local training + cluster_train = False + + if not cluster_train: + paddle.init(use_gpu=False, trainer_count=1) + else: + paddle.init( + use_gpu=False, + trainer_count=2, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1) + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc(input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. 
/ math.sqrt(embsize * 8), + learning_rate=1)) + predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + with gzip.open("batch-" + str(event.batch_id) + ".tar.gz", + 'w') as f: + trainer.save_parameter_to_tar(f) + result = trainer.test( + paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), 32)) + print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + result.metrics) + + cost = paddle.layer.classification_cost(input=predictword, label=nextword) + + parameters = paddle.parameters.create(cost) + adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) + trainer = paddle.trainer.SGD(cost, + parameters, + adagrad, + is_local=not cluster_train) + trainer.train( + paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), + num_passes=30, + event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..2afce9a66e521f9e3a8d566dd23762969f0594f5 --- /dev/null +++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py @@ -0,0 +1,137 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
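+# NOTE: how this script fits the OpenMPI workflow in openmpi_en.md. The one
+# assumption here is about start_mpi_train.sh (its contents are not shown):
+# it is expected to export the PADDLE_* environment variables read below.
+#   * mpirun sets OMPI_COMM_WORLD_RANK for each process; the rank picks the
+#     data shard, e.g. rank 0 reads ./train_data_dir/train/train.txt-00000
+#     through cluster_reader_cluster().
+#   * PADDLE_CLUSTER_TRAIN toggles distributed mode; pserver addresses, ports
+#     and the trainer id come from the PADDLE_INIT_* variables; otherwise the
+#     script falls back to local single-node training.
+#   * Typical launch from the head node (see openmpi_en.md):
+#       mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh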
+ +import math +import os +import paddle.v2 as paddle +import pickle + +embsize = 32 +hiddensize = 256 +N = 5 +cluster_train_file = "./train_data_dir/train/train.txt" +cluster_test_file = "./test_data_dir/test/test.txt" +node_id = os.getenv("OMPI_COMM_WORLD_RANK") +if not node_id: + raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK") + + +def wordemb(inlayer): + wordemb = paddle.layer.embedding( + input=inlayer, + size=embsize, + param_attr=paddle.attr.Param( + name="_proj", + initial_std=0.001, + learning_rate=1, + l2_rate=0, + sparse_update=True)) + return wordemb + + +def cluster_reader_cluster(filename, node_id): + def cluster_reader(): + with open("-".join([filename, "%05d" % int(node_id)]), "r") as f: + for l in f: + csv_data = [int(cell) for cell in l.split(",")] + yield tuple(csv_data) + + return cluster_reader + + +def main(): + # get arguments from env + + # for local training + TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"] + cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH + use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") + + if not cluster_train: + paddle.init( + use_gpu=use_gpu, + trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1"))) + else: + paddle.init( + use_gpu=use_gpu, + trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")), + port=int(os.getenv("PADDLE_INIT_PORT", "7164")), + ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")), + ports_num_for_sparse=int( + os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")), + num_gradient_servers=int( + os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")), + trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")), + pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1")) + fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r") + word_dict = pickle.load(fn) + fn.close() + dict_size = len(word_dict) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc(input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1.
/ math.sqrt(embsize * 8), + learning_rate=1)) + predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + result = trainer.test( + paddle.batch( + cluster_reader_cluster(cluster_test_file, node_id), 32)) + print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + result.metrics) + + cost = paddle.layer.classification_cost(input=predictword, label=nextword) + parameters = paddle.parameters.create(cost) + adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) + trainer = paddle.trainer.SGD(cost, + parameters, + adagrad, + is_local=not cluster_train) + trainer.train( + paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32), + num_passes=30, + event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/usage/cluster/src/word2vec/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..ade01c378efced05787e1f62e6fa6c38f8ac5ad3 --- /dev/null +++ b/doc/howto/usage/cluster/src/word2vec/prepare.py @@ -0,0 +1,55 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
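+# NOTE: a short summary of what this preparation script produces (it follows
+# directly from the code below; openmpi_en.md runs it once on the head node):
+#   * word_dict.pickle - the pickled imikolov vocabulary.
+#   * train.txt / test.txt - every sample written as comma-separated word ids,
+#     then cut by `split -d -a 5` into SPLIT_COUNT shards named
+#     train.txt-00000, train.txt-00001, ... and test.txt-00000, ...
+#   * Each MPI node later receives one shard (copied with scp in
+#     openmpi_en.md), and api_train_v2_cluster.py opens the shard that matches
+#     its OMPI_COMM_WORLD_RANK.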
+ +import paddle.v2 as paddle +import tarfile +import os +import pickle + +SPLIT_COUNT = 3 +N = 5 + + +def file_len(fd): + for i, l in enumerate(fd): + pass + return i + 1 + + +def split_from_reader_by_line(filename, reader, split_count): + fn = open(filename, "w") + for batch_id, batch_data in enumerate(reader()): + batch_data_str = [str(d) for d in batch_data] + fn.write(",".join(batch_data_str)) + fn.write("\n") + fn.close() + + fn = open(filename, "r") + total_line_count = file_len(fn) + fn.close() + per_file_lines = total_line_count / split_count + 1 + cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename) + os.system(cmd) + + +word_dict = paddle.dataset.imikolov.build_dict() +with open("word_dict.pickle", "w") as dict_f: + pickle.dump(word_dict, dict_f) + +split_from_reader_by_line("train.txt", + paddle.dataset.imikolov.train(word_dict, N), + SPLIT_COUNT) +split_from_reader_by_line("test.txt", + paddle.dataset.imikolov.test(word_dict, N), + SPLIT_COUNT) diff --git a/doc/howto/usage/k8s/src/worker_security_group.png b/doc/howto/usage/cluster/src/worker_security_group.png similarity index 100% rename from doc/howto/usage/k8s/src/worker_security_group.png rename to doc/howto/usage/cluster/src/worker_security_group.png diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md index f7aa525054468670f59309ddf9206af55bb77869..2dea231ca5487978d59a4d0a570431722ed6b3bf 100644 --- a/doc/howto/usage/cmd_parameter/arguments_cn.md +++ b/doc/howto/usage/cmd_parameter/arguments_cn.md @@ -63,7 +63,7 @@ -训练dot_period +训练dot_period √√ diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md deleted file mode 100644 index ce72b0803818d5bf0c18753c421848cf2fc1b668..0000000000000000000000000000000000000000 --- a/doc/howto/usage/k8s/k8s_aws_en.md +++ /dev/null @@ -1,689 +0,0 @@ - -# Distributed PaddlePaddle Training on AWS with Kubernetes - -We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts. - -## Distributed PaddlePaddle Training Core Concepts - -### Distributed Training Job - -A distributed training job is represented by a [Kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job). - -Each Kuberentes job is described by a job config file, which specifies the information like the number of [pods](https://kubernetes.io/docs/user-guide/pods/#what-is-a-pod) in the job and environment variables. - -In a distributed training job, we would: - -1. prepare partitioned training data and configuration file on a distributed file system (in this tutorial we use Amazon Elastic File System), and -1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job. - -### Parameter Servers and Trainers - -There are two roles in a PaddlePaddle cluster: *parameter server (pserver)* and *trainer*. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model. - -
![Model is partitioned into two shards. Managed by two parameter servers respectively.](src/pserver_and_trainer.png)
- -In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be schduled onto another node of different ip address. However, now we are using static ip. This will be improved. - -Parameter server and trainer are packaged into a same docker image. They will run once pod is scheduled by kubernetes job. - -### Trainer ID - -Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID. - -### Training - -The entry-point of a container is a shell script. It can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job. - -We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows: - -1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip. -1. Copy the training data from EFS persistent volume into container. -1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes. -1. Trainer with `train_id` 0 will automatically write results onto EFS volume. - - -## PaddlePaddle on AWS with Kubernetes - -### Choose AWS Service Region -This tutorial requires several AWS services work in the same region. Before we create anything in AWS, please check the following link -https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ -Choose a region which has the following services available: EC2, EFS, VPS, CloudFormation, KMS, VPC, S3. -In this tutorial, we use "Oregon(us-west-2)" as example. - -### Create AWS Account and IAM Account - -Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user. - -To sign up an AWS account, please -follow -[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html). -To create IAM users and user groups under an AWS account, please -follow -[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html). - -Please be aware that this tutorial needs the following privileges for the user in IAM: - -- AmazonEC2FullAccess -- AmazonS3FullAccess -- AmazonRoute53FullAccess -- AmazonRoute53DomainsFullAccess -- AmazonElasticFileSystemFullAccess -- AmazonVPCFullAccess -- IAMUserSSHKeys -- IAMFullAccess -- NetworkAdministrator -- AWSKeyManagementServicePowerUser - - -### Download kube-aws and kubectl - -#### kube-aws - -[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS. -##### Verify kube-aws integrity -Note: if you are using a non-official release (e.g RC release) kube-aws, you can skip this setp. 
-Import the CoreOS Application Signing Public Key: - -``` -gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E -``` - -Validate the key fingerprint: - -``` -gpg2 --fingerprint FC8A365E -``` -The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E` - -We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1 - -Validate the tarball's GPG signature: - -``` -PLATFORM=linux-amd64 - # Or -PLATFORM=darwin-amd64 - -gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz -``` -##### Install kube-aws -Extract the binary: - -``` -tar zxvf kube-aws-${PLATFORM}.tar.gz -``` - -Add kube-aws to your path: - -``` -mv ${PLATFORM}/kube-aws /usr/local/bin -``` - - -#### kubectl - -[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters. - -Download `kubectl` from the Kubernetes release artifact site with the `curl` tool. - -``` -# OS X -curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl - -# Linux -curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl -``` - -Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`): - -``` -chmod +x ./kubectl -sudo mv ./kubectl /usr/local/bin/kubectl -``` - -### Configure AWS Credentials - -First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface. - -And then configure your AWS account information: - -``` -aws configure -``` - - -Fill in the required fields: - - -``` -AWS Access Key ID: YOUR_ACCESS_KEY_ID -AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY -Default region name: us-west-2 -Default output format: json -``` - -`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` is the IAM key and secret from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account) - -Verify that your credentials work by describing any instances you may already have running on your account: - -``` -aws ec2 describe-instances -``` - -### Define Cluster Parameters - -#### EC2 key pair - -The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node. - -Follow [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create a EC2 key pair - -After creating a key pair, you will use the key pair name to configure the cluster. - -Key pairs are only available to EC2 instances in the same region. We are using us-west-2 in our tutorial, so make sure to creat key pairs in that region (Oregon). - -Your browser will download a `key-name.pem` file which is the key to access the EC2 instances. We will use it later. - - -#### KMS key - -Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key. 
- -You can create a KMS key with the aws command line tool: - -``` -aws kms --region=us-west-2 create-key --description="kube-aws assets" -{ - "KeyMetadata": { - "CreationDate": 1458235139.724, - "KeyState": "Enabled", - "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx", - "AWSAccountId": "xxxxxxxxxxxxx", - "Enabled": true, - "KeyUsage": "ENCRYPT_DECRYPT", - "KeyId": "xxxxxxxxx", - "Description": "kube-aws assets" - } -} -``` - -We will need to use the value of `Arn` later. - -And then let's add several inline policies in your IAM user permission. - -Go to [IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home). Click on button `Users`, click user that we just created, and then click on `Add inline policy` button, and select `Custom Policy`. - -Paste into following inline policies: - -``` - (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "Stmt1482205552000", - "Effect": "Allow", - "Action": [ - "kms:Decrypt", - "kms:Encrypt" - ], - "Resource": [ - "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*" - ] - }, - { - "Sid": "Stmt1482205746000", - "Effect": "Allow", - "Action": [ - "cloudformation:CreateStack", - "cloudformation:UpdateStack", - "cloudformation:DeleteStack", - "cloudformation:DescribeStacks", - "cloudformation:DescribeStackResource", - "cloudformation:GetTemplate", - "cloudformation:DescribeStackEvents" - ], - "Resource": [ - "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*" - ] - } - ] -} -``` -`Version` : Its value has to be exactly "2012-10-17". -`AWS_ACCOUNT_ID`: You can get it from following command line: - -``` -aws sts get-caller-identity --output text --query Account -``` - -`MY_CLUSTER_NAME`: Pick a MY_CLUSTER_NAME that you like, you will use it later as well. -Please note, stack name must satisfy regular expression pattern: [a-zA-Z][-a-zA-Z0-9*]*, which means no "_" or "-" in stack name, or kube-aws will throw error in later steps. - -#### External DNS name - -When the cluster is created, the controller will expose the TLS-secured API on a DNS name. - -DNS name should have a CNAME points to cluster DNS name or an A record points to the cluster IP address. - -We will need to use DNS name later in tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate cluster IP with that DNS name for your local machine. And add name service (route53) in aws to associate the IP to paddle for cluster. We will find the cluster IP in later steps. - -#### S3 bucket - -You need to create an S3 bucket before startup the Kubernetes cluster. - -There are some bugs in aws cli in creating S3 bucket, so let's use the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2). - -Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-2 (Oregon). 
- - -#### Initialize Assets - -Create a directory on your local machine to hold the generated assets: - -``` -$ mkdir my-cluster -$ cd my-cluster -``` - -Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step: - -``` -kube-aws init \ ---cluster-name=MY_CLUSTER_NAME \ ---external-dns-name=MY_EXTERNAL_DNS_NAME \ ---region=us-west-2 \ ---availability-zone=us-west-2a \ ---key-name=KEY_PAIR_NAME \ ---kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx" -``` - -`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key) - -`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name) - -`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair) - -`--kms-key-arn`: the "Arn" in [KMS key](#kms-key) - -Here `us-west-2a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts. - -Please check if `us-west-2a` is supported by `aws ec2 --region us-west-2 describe-availability-zones`, if not switch to other supported availability zone. (e.g., `us-west-2a`, or `us-west-2b`) - - -There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster. - -By default `kube-aws` will only create one worker node. Let's edit `cluster.yaml` and change `workerCount` from 1 to 3. - - -#### Render contents of the asset directory - -In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you. - -``` -kube-aws render credentials --generate-ca -``` - -The next command generates the default set of cluster assets in your asset directory. - -``` -kube-aws render stack -``` -Assets (templates and credentials) that are used to create, update and interact with your Kubernetes cluster will be created under your current folder. - - -### Kubernetes Cluster Start Up - -#### Create the instances defined in the CloudFormation template - -Now let's create your cluster (choose any `PREFIX` for the command below): - -``` -kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX -``` - -`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket) - - -#### Configure DNS - -You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation. - -``` -$ kube-aws status -Cluster Name: paddle-cluster -Controller DNS Name: paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com -``` - -If you own a DNS name, set the A record to any of the above ip. __Or__ you can set up CNAME point to `Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`) - -##### Find IP address - -Use command `dig` to check the load balancer hostname to get the ip address. - -``` -$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com - -;; QUESTION SECTION: -;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A - -;; ANSWER SECTION: -paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52 -paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112 -``` - -In the above output, both ip `54.241.164.52`, `54.67.102.112` will work. - -*If you own a DNS name*, set the A record to any of the above ip. Then you can skip to the step "Access the cluster". - -*If you do not own a DNS name*: -##### Update local DNS association -Edit `/etc/hosts` to associate above ip with the DNS name. 
-##### Add Route53 private name service in VPC - - Open [Route53 Console](https://console.aws.amazon.com/route53/home) - - Create hosted zone with following config - - Domain name: "paddle" - - Type: "Private hosted zone for amazon VPC" - - VPC ID: `` - - ![route53 zone setting](src/route53_create_zone.png) - - Add A record - - Click on the zone "paddle" just created - - Click the button "Create record set" - - Name : leave blank - - type: "A" - - Value: `` - - ![route53 create recordset](src/route53_create_recordset.png) - - Verify name service - - Connect to any instance created by kube-aws via ssh - - Run command "host paddle", see if the ip returned is the private ip of kube-controller - -#### Access the cluster - -Once the API server is running, you should see: - -``` -$ kubectl --kubeconfig=kubeconfig get nodes -NAME STATUS AGE -ip-10-0-0-134.us-west-2.compute.internal Ready 6m -ip-10-0-0-238.us-west-2.compute.internal Ready 6m -ip-10-0-0-50.us-west-2.compute.internal Ready 6m -ip-10-0-0-55.us-west-2.compute.internal Ready 6m -``` - - -### Setup Elastic File System for Cluster - -Training data is usually served on a distributed filesystem, we use Elastic File System (EFS) on AWS. - -1. Create security group for EFS in [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) - 1. Look up security group id for `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below) -
![](src/worker_security_group.png)
- 2. Add security group `paddle-efs` with `ALL TCP` inbound rule and custom source as group id of `paddle-cluster-sg-worker`. And VPC of `paddle-cluster-vpc`. Make sure availability zone is same as the one you used in [Initialize Assets](#initialize-assets). -
![](src/add_security_group.png)
- -2. Create the Elastic File System in [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) with `paddle-cluster-vpc` VPC. Make sure subnet is `paddle-cluster-Subnet0` andd security group is `paddle-efs`. -
![](src/create_efs.png)
- - -### Start PaddlePaddle Training Demo on AWS - -#### Configure Kubernetes Volume that Points to EFS - -First we need to create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) to provision EFS volumn. - -Save following snippet as `pv.yaml` -``` -apiVersion: v1 -kind: PersistentVolume -metadata: - name: efsvol -spec: - capacity: - storage: 100Gi - accessModes: - - ReadWriteMany - nfs: - server: EFS_DNS_NAME - path: "/" -``` - -`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com` - -Run following command to create a persistent volumn: -``` -kubectl --kubeconfig=kubeconfig create -f pv.yaml -``` - -Next let's create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume. - -Save following snippet as `pvc.yaml`. -``` -kind: PersistentVolumeClaim -apiVersion: v1 -metadata: - name: efsvol -spec: - accessModes: - - ReadWriteMany - resources: - requests: - storage: 50Gi -``` - -Run following command to create a persistent volumn claim: -``` -kubectl --kubeconfig=kubeconfig create -f pvc.yaml -``` - -#### Prepare Training Data - -We will now launch a kubernetes job that downloads, saves and evenly splits training data into 3 shards on the persistent volumn that we just created. - -save following snippet as `paddle-data-job.yaml` -``` -apiVersion: batch/v1 -kind: Job -metadata: - name: paddle-data -spec: - template: - metadata: - name: pi - spec: - containers: - - name: paddle-data - image: paddledev/paddle-tutorial:k8s_data - imagePullPolicy: Always - volumeMounts: - - mountPath: "/efs" - name: efs - env: - - name: OUT_DIR - value: /efs/paddle-cluster-job - - name: SPLIT_COUNT - value: "3" - volumes: - - name: efs - persistentVolumeClaim: - claimName: efsvol - restartPolicy: Never -``` - -Run following command to launch the job: -``` -kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml -``` - -Job may take 7 min to finish, use following command to check job status. Do not proceed until `SUCCESSFUL` for `paddle-data` job is `1` -``` -$ kubectl --kubeconfig=kubeconfig get jobs -NAME DESIRED SUCCESSFUL AGE -paddle-data 1 1 6m -``` - -Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code. - -#### Start Training - -Now we are ready to start paddle training job. 
Save following snippet as `paddle-cluster-job.yaml` -``` -apiVersion: batch/v1 -kind: Job -metadata: - name: paddle-cluster-job -spec: - parallelism: 3 - completions: 3 - template: - metadata: - name: paddle-cluster-job - spec: - volumes: - - name: efs - persistentVolumeClaim: - claimName: efsvol - containers: - - name: trainer - image: paddledev/paddle-tutorial:k8s_train - command: ["bin/bash", "-c", "/root/start.sh"] - env: - - name: JOB_NAME - value: paddle-cluster-job - - name: JOB_PATH - value: /home/jobpath - - name: JOB_NAMESPACE - value: default - - name: TRAIN_CONFIG_DIR - value: quick_start - - name: CONF_PADDLE_NIC - value: eth0 - - name: CONF_PADDLE_PORT - value: "7164" - - name: CONF_PADDLE_PORTS_NUM - value: "2" - - name: CONF_PADDLE_PORTS_NUM_SPARSE - value: "2" - - name: CONF_PADDLE_GRADIENT_NUM - value: "3" - - name: TRAINER_COUNT - value: "3" - volumeMounts: - - mountPath: "/home/jobpath" - name: efs - ports: - - name: jobport0 - hostPort: 7164 - containerPort: 7164 - - name: jobport1 - hostPort: 7165 - containerPort: 7165 - - name: jobport2 - hostPort: 7166 - containerPort: 7166 - - name: jobport3 - hostPort: 7167 - containerPort: 7167 - restartPolicy: Never -``` - -`parallelism: 3, completions: 3` means this job will simultaneously start 3 PaddlePaddle pods, and this job will be finished when there are 3 finished pods. - -`env` field represents container's environment variables, we specify PaddlePaddle parameters by environment variables. - -`ports` indicates that TCP port 7164 - 7167 are exposed for communication between `pserver` ans trainer. port starts continously from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). We use multiple ports for dense and sparse paramter updates to improve latency. - -Run following command to launch the job. -``` -kubectl --kubeconfig=kubeconfig create -f paddle-claster-job.yaml -``` - -Inspect individual pods - -``` -$ kubectl --kubeconfig=kubeconfig get pods -NAME READY STATUS RESTARTS AGE -paddle-cluster-job-cm469 1/1 Running 0 9m -paddle-cluster-job-fnt03 1/1 Running 0 9m -paddle-cluster-job-jx4xr 1/1 Running 0 9m -``` - -Inspect individual console output -``` -kubectl --kubeconfig=kubeconfig log -f POD_NAME -``` - -`POD_NAME`: name of any pod (e.g., `paddle-cluster-job-cm469`). - -Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes. - -The details for start `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code. - -#### Inspect Training Output - -Training output (model snapshot and logs) will be saved in EFS. We can ssh into worker EC2 instance, mount EFS and check training output. - -1. ssh Into Worker EC2 instance -``` -chmod 400 key-name.pem -ssh -i key-name.pem core@INSTANCE_IP -``` - -`INSTANCE_IP`: public IP address of EC2 kubernetes worker node. Go to [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check `public IP` of any `paddle-cluster-kube-aws-worker` instance. - -2. Mount EFS -``` -mkdir efs -sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs -``` - -`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Look similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`. 
- -Now folder `efs` will have structure similar to: -``` --- paddle-cluster-job - |-- ... - |-- output - | |-- node_0 - | | |-- server.log - | | `-- train.log - | |-- node_1 - | | |-- server.log - | | `-- train.log - | |-- node_2 - | | |-- server.log - | | `-- train.log - | |-- pass-00000 - | | |-- ___fc_layer_0__.w0 - | | |-- ___fc_layer_0__.wbias - | | |-- done - | | |-- path.txt - | | `-- trainer_config.lr.py - | |-- pass-00001... -``` -`server.log` contains log for `pserver`. `train.log` contains log for `trainer`. Model description and snapshot is stored in `pass-0000*`. - -### Kubernetes Cluster Tear Down - -#### Delete EFS - -Go to [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volumn that we created. - -#### Delete security group - -Go to [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete security group `paddle-efs`. - - -#### Delete S3 Bucket - -Go to [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket that we created. - -#### Destroy Cluster - -``` -kube-aws destroy -``` - -The command will return immediately, but it might take 5 min to tear down the whole cluster. - -You can go to [CludFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check destroy process. diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md deleted file mode 100644 index 4c3dc81ed38f239c1f4a83d22b49cf57b5d16a8b..0000000000000000000000000000000000000000 --- a/doc/howto/usage/k8s/k8s_basis_cn.md +++ /dev/null @@ -1,75 +0,0 @@ -# Kubernetes 简介 - -[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统,其提供应用部署、维护、扩展机制等功能,利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行,且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws),[Azure](http://kubernetes.io/docs/getting-started-guides/azure/),[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前,需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识,下面先简要介绍一下本文用到的几个Kubernetes概念。 - -- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点,这个节点可以是物理机或者虚拟机,Kubernetes集群就是由node节点与master节点组成的。 - -- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器,pod是Kubernetes的最小调度单元,一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET,PID,IPC,UTS等Linux namespace。由于容器之间共享NET namespace,所以它们使用同一个IP地址,可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。 - -- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业,一次作业称为一个job,通常每个job包括一个或者多个pods,job启动后会创建这些pod并开始执行一个程序,等待这个程序执行成功并返回0则成功退出,如果执行失败,也可以配置不同的重试机制。 - -- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷,是pod内的容器都可以访问的共享目录,也是容器与node之间共享文件的方式,因为容器内的文件都是暂时存在的,当容器因为各种原因被销毁时,其内部的文件也会随之消失。通过volume,就可以将这些文件持久化存储。Kubernetes支持多种volume,例如hostPath(宿主机目录),gcePersistentDisk,awsElasticBlockStore等。 - -- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间,在kubernetes中创建的所有资源对象(例如上文的pod,job)等都属于一个命名空间,在同一个命名空间中,资源对象的名字是唯一的,不同空间的资源名可以重复,命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。 - -- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合,将外部的存储服务在Kubernetes中描述成为统一的资源形式,便于存储资源管理和Pod引用。 - -## 部署Kubernetes集群 - -Kubernetes提供了多种集群部署的方案,本文档内不重复介绍。这里给出集中常见的部署方法: - -- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 
快速在本地启动一个单机的kubernetes服务器,便于本地验证和测试。 -- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统,不同主机(Bare-Metal, AWS, GCE)条件下,快速部署集群。 -- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。 -- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。 - -可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。 - -## 选择存储方案 - -容器不会保留在运行时生成的数据,job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务,需要有一个外部的存储服务来保存训练所需数据和训练输出。 -常见的可选存储服务包括: - -- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单,可以用于小量数据的验证。不提供分布式存储,高可用,冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。 -- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统,可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。 -- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统,支持rbd,POSIX API接口(ceph fs)和对象存储API,参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。 -- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。 - -## 配置kubectl - -### 安装kubectl -``` -# OS X -curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl - -# Linux -curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl - -# Windows -curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe -``` - -### 配置kubectl访问你的kubernetes集群 - -编辑`~/.kube/config`这个配置文件,修改`Master-IP`的地址。如果使用SSL认证,则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问(比如通过8080端口),也可以去掉这些证书的配置。 -``` -apiVersion: v1 -clusters: -- cluster: - certificate-authority: /path/to/ca.crt - server: https://[Master-IP]:443 - name: minikube -contexts: -- context: - cluster: minikube - user: minikube - name: minikube -current-context: minikube -kind: Config -preferences: {} -users: -- name: minikube - user: - client-certificate: /path/to/apiserver.crt - client-key: /Users/wuyi/.minikube/apiserver.key -``` diff --git a/doc/howto/usage/k8s/k8s_cn.md b/doc/howto/usage/k8s/k8s_cn.md deleted file mode 100644 index ab07cb9cd5b135ddea82b3360720537f1dc5a801..0000000000000000000000000000000000000000 --- a/doc/howto/usage/k8s/k8s_cn.md +++ /dev/null @@ -1,205 +0,0 @@ -# Kubernetes单机训练 - -在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。 - -## 制作Docker镜像 - -在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在 -Paddle的Docker image里。为此,我们需要制作一个包含训练数据的Paddle镜像。 - -Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) -里介绍了用Paddle源码中的脚本下载训练数据的过程。 -而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo,( 请注意,默认的 -Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ),所以我们使用这个镜像来下载训练数据到Docker container中,然后把这个包含了训练数据的container保存为一个新的镜像。 - -### 运行容器 - -``` -$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest -``` - -### 下载数据 - 
-进入容器`/root/paddle/demo/quick_start/data`目录,使用`get_data.sh`下载数据 - -``` -$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh - -Downloading Amazon Electronics reviews data... ---2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz -Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80 -Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected. -HTTP request sent, awaiting response... 200 OK -Length: 495854086 (473M) [application/x-gzip] -Saving to: 'reviews_Electronics_5.json.gz' - - 10% [=======> ] 874,279 64.7KB/s eta 2h 13m - -``` - -### 修改启动脚本 - -下载完数据后,修改`/root/paddle/demo/quick_start/train.sh`文件,内容如下(增加了一条cd命令) -``` -set -e -cd /root/paddle/demo/quick_start -cfg=trainer_config.lr.py -#cfg=trainer_config.emb.py -#cfg=trainer_config.cnn.py -#cfg=trainer_config.lstm.py -#cfg=trainer_config.bidi-lstm.py -#cfg=trainer_config.db-lstm.py -paddle train \ - --config=$cfg \ - --save_dir=./output \ - --trainer_count=4 \ - --log_period=20 \ - --num_passes=15 \ - --use_gpu=false \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -``` - -### 提交镜像 - -修改启动脚本后,退出容器,使用`docker commit`命令创建新镜像。 - -``` -$ docker commit quick_start_data mypaddle/paddle:quickstart -``` - -## 使用 Kubernetes 进行训练 - ->针对任务运行完成后容器自动退出的场景,Kubernetes有Job类型的资源来支持。下文就是用Job类型的资源来进行训练。 - -### 编写yaml文件 - -在训练时,输出结果可能会随着容器的消耗而被删除,需要在创建容器前挂载卷以便我们保存训练结果。使用我们之前构造的镜像,可以创建一个 [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job),简单的yaml文件如下: - -``` -apiVersion: batch/v1 -kind: Job -metadata: - name: quickstart -spec: - parallelism: 1 - completions: 1 - template: - metadata: - name: quickstart - spec: - volumes: - - name: output - hostPath: - path: /home/work/paddle_output - containers: - - name: pi - image: mypaddle/paddle:quickstart - command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"] - volumeMounts: - - name: output - mountPath: /root/paddle/demo/quick_start/output - restartPolicy: Never -``` - -### 创建Paddle Job - -使用上文创建的yaml文件创建Kubernetes Job,命令为: - -``` -$ kubectl create -f paddle.yaml -``` - -查看job的详细情况: - -``` -$ kubectl get job -NAME DESIRED SUCCESSFUL AGE -quickstart 1 0 58s - -$ kubectl describe job quickstart -Name: quickstart -Namespace: default -Image(s): registry.baidu.com/public/paddle:cpu-demo-latest -Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84 -Parallelism: 1 -Completions: 1 -Start Time: Mon, 31 Oct 2016 11:20:16 +0800 -Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart -Pods Statuses: 0 Running / 1 Succeeded / 0 Failed -Volumes: - output: - Type: HostPath (bare host directory volume) - Path: /home/work/paddle_output -Events: - FirstSeen LastSeen Count From SubobjectPath Type Reason Message - --------- -------- ----- ---- ------------- -------- ------ ------- - 1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx -``` - -### 查看训练结果 - -根据Job对应的Pod信息,可以查看此Pod运行的宿主机。 - -``` -kubectl describe pod quickstart-fa0wx -Name: quickstart-fa0wx -Namespace: default -Node: paddle-demo-let02/10.206.202.44 -Start Time: Mon, 31 Oct 2016 11:20:17 +0800 -Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart -Status: Succeeded -IP: 10.0.0.9 -Controllers: Job/quickstart -Containers: - quickstart: - Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465 - Image: registry.baidu.com/public/paddle:cpu-demo-latest - Image 
ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750 - Port: - Command: - bin/bash - -c - /root/paddle/demo/quick_start/train.sh - QoS Tier: - cpu: BestEffort - memory: BestEffort - State: Terminated - Reason: Completed - Exit Code: 0 - Started: Mon, 31 Oct 2016 11:20:20 +0800 - Finished: Mon, 31 Oct 2016 11:21:46 +0800 - Ready: False - Restart Count: 0 - Environment Variables: -Conditions: - Type Status - Ready False -Volumes: - output: - Type: HostPath (bare host directory volume) - Path: /home/work/paddle_output -``` - -我们还可以登录到宿主机上查看训练结果。 - -``` -[root@paddle-demo-let02 paddle_output]# ll -total 60 -drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000 -drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014 -``` diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md deleted file mode 100644 index 3121b3f59df650c0a22d0bd305a6f793b202d30e..0000000000000000000000000000000000000000 --- a/doc/howto/usage/k8s/k8s_distributed_cn.md +++ /dev/null @@ -1,315 +0,0 @@ -# Kubernetes分布式训练 - -前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。 - -有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群,可以参考[k8s_basis](./k8s_basis_cn.md)。 - -## 整体方案 - -在训练之前,用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统,需要使用其制定的方式挂载后并导入数据),训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下: - -![paddle on kubernetes结构图](src/k8s-paddle-arch.png) - -上图描述了一个3节点的分布式训练场景,在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行,每个pod包含一个PaddlePaddle容器。在容器创建后,会启动pserver与trainer进程,读取volume中的数据进行这次分布式训练。 - -根据前文的描述,要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练,按照下面步骤即可: - -1. [制作PaddlePaddle镜像](#制作镜像) -1. [将训练文件与切分好的数据上传到共享存储](#上传训练文件) -1. [编写本次训练的YAML文件,创建一个Kubernetes job](#创建Job) -1. [训练结束后查看输出结果](#查看输出) - -下面就根据这几个步骤分别介绍。 - -### 制作镜像 - -PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境,用这个镜像创建的容器需要有以下两个功能: - -- 拷贝训练文件到容器内 -- 生成`paddle pserver`与`paddle train`进程的启动参数,并且启动训练 - -因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。 - -```bash -$ cd doc/howto/usage/k8s/src/k8s_train -$ docker build -t [YOUR_REPO]/paddle:mypaddle . -``` - -然后将构建成功的镜像上传到镜像仓库。 - -```bash -docker push [YOUR_REPO]/paddle:mypaddle -``` - -注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。 - -### 准备训练数据 - -这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据,也可以通过修改[k8s_train](./src/k8s_train/README.md)的内容来定制image. 
- -在启动Job之前,需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下. - -```yaml -apiVersion: batch/v1 -kind: Job -metadata: - name: paddle-data -spec: - template: - metadata: - name: pi - spec: - hostNetwork: true - containers: - - name: paddle-data - image: paddledev/paddle-tutorial:k8s_data - imagePullPolicy: Always - volumeMounts: - - mountPath: "/mnt" - name: nfs - env: - - name: OUT_DIR - value: /home/work/mfs/paddle-cluster-job - - name: SPLIT_COUNT - value: "3" - volumes: - - name: nfs - persistentVolumeClaim: - claimName: mfs - restartPolicy: Never -``` - -完成后volume中的文件内容大致如下: -```base -[root@paddle-kubernetes-node0 nfsdir]$ tree -d -. -`-- paddle-cluster-job - |-- 0 - | `-- data - |-- 1 - | `-- data - |-- 2 - | `-- data - |-- output - |-- quick_start -``` - -目录中paddle-cluster-job是本次训练对应的job name,本次训练要求有3个PaddlePaddle节点,在paddle-cluster-job/data目录中存放切分好的数据,文件夹0,1,2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件,output文件夹存放训练结果与日志。 - -### 创建Job - -Kubernetes可以通过YAML文件来创建相关对象,然后可以使用命令行工具创建job。 - -Job YAML文件描述了这次训练使用的Docker镜像,需要启动的节点个数以及 `paddle pserver`与 `paddle train`进程启动的必要参数,也描述了容器需要使用的存储卷挂载的情况。YAML文件中各个字段的具体含义,可以查看[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job)。例如,本次训练的YAML文件可以写成: - -```yaml -apiVersion: batch/v1 -kind: Job -metadata: - name: paddle-cluster-job -spec: - parallelism: 3 - completions: 3 - template: - metadata: - name: paddle-cluster-job - spec: - volumes: - - name: jobpath - hostPath: - path: /home/work/mfs - containers: - - name: trainer - image: [YOUR_REPO]/paddle:mypaddle - command: ["bin/bash", "-c", "/root/start.sh"] - env: - - name: JOB_NAME - value: paddle-cluster-job - - name: JOB_PATH - value: /home/jobpath - - name: JOB_NAMESPACE - value: default - - name: TRAIN_CONFIG_DIR - value: recommendation - - name: CONF_PADDLE_NIC - value: eth0 - - name: CONF_PADDLE_PORT - value: "7164" - - name: CONF_PADDLE_PORTS_NUM - value: "2" - - name: CONF_PADDLE_PORTS_NUM_SPARSE - value: "2" - - name: CONF_PADDLE_GRADIENT_NUM - value: "3" - volumeMounts: - - name: jobpath - mountPath: /home/jobpath - restartPolicy: Never -``` - -文件中,`metadata`下的`name`表示这个job的名字。`parallelism,completions`字段表示这个job会同时开启3个PaddlePaddle节点,成功训练且退出的pod数目为3时,这个job才算成功结束。然后申明一个存储卷`jobpath`,代表宿主机目录`/home/work/mfs`,在对容器的描述`containers`字段中,将此目录挂载为容器的`/home/jobpath`目录,这样容器的`/home/jobpath`目录就成为了共享存储,放在这个目录里的文件其实是保存到了MFS上。 - -`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内。 - -环境变量 | 说明 ---- | --- -JOB_PATH | 共享存储挂在的路径 -JOB_NAME | Job的名字 -TRAIN_CONFIG_DIR | 本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径 -CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数,即网卡名 -CONF_PADDLE_PORT | `paddle paserver`的`--port`参数 -CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量,即`--ports_num`参数 -CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量,即`--ports_num_for_sparse`参数 -CONF_PADDLE_GRADIENT_NUM | 训练节点数量,即`--num_gradient_servers参数` - -这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。 - -编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。 - -```bash -kubectl create -f job.yaml -``` - -创建成功后,Kubernetes就会创建3个pod作为PaddlePaddle节点然后拉取镜像,启动容器开始训练。 - - -### 查看输出 - -在训练过程中,可以在共享存储上查看输出的日志和模型,例如output目录下就存放了输出结果。注意node_0,node_1,node_2这几个目录表示PaddlePaddle节点与trainer_id,并不是Kubernetes中的node概念。 - -```bash -[root@paddle-kubernetes-node0 output]# tree -d -. -├── node_0 -│   ├── server.log -│   └── train.log -├── node_1 -│   ├── server.log -│   └── train.log -├── node_2 -...... 
-├── pass-00002 -│   ├── done -│   ├── ___embedding_0__.w0 -│   ├── ___embedding_1__.w0 -...... -``` - -我们可以通过日志查看容器训练的情况,例如: - -```bash -[root@paddle-kubernetes-node0 node_0]# cat train.log -I1116 09:10:17.123121 50 Util.cpp:155] commandline: - /usr/local/bin/../opt/paddle/bin/paddle_trainer - --nics=eth0 --port=7164 - --ports_num=2 --comment=paddle_process_by_paddle - --pservers=192.168.129.66,192.168.223.143,192.168.129.71 - --ports_num_for_sparse=2 --config=./trainer_config.py - --trainer_count=4 --num_passes=10 --use_gpu=0 - --log_period=50 --dot_period=10 --saving_period=1 - --local=0 --trainer_id=0 - --save_dir=/home/jobpath/paddle-cluster-job/output -I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions -I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done. -[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config. -[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating] -[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__] -I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal -I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process -I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process -I1116 09:10:17.681543 50 GradientMachine.cpp:134] Initing parameters.. -I1116 09:10:18.012390 50 GradientMachine.cpp:141] Init parameters done. -I1116 09:10:18.018641 50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164 -I1116 09:10:18.018950 50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165 -I1116 09:10:18.019069 50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164 -I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165 -I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164 -I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165 -``` - - -## 一些细节的补充 - -### 使用环境变量 - -使用容器方式运行训练任务的Kubernetes Job,通常会使用环境变量配置Job的配置信息`start_paddle.py`提供了一个启动脚本,将环境变量转换成paddle的命令行参数: -``` -API = "/api/v1/namespaces/" -JOBSELECTOR = "labelSelector=job-name=" -JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME") -JOB_PATH_OUTPUT = JOB_PATH + "/output" -JOBNAME = os.getenv("JOB_NAME") -NAMESPACE = os.getenv("JOB_NAMESPACE") -PADDLE_NIC = os.getenv("CONF_PADDLE_NIC") -PADDLE_PORT = os.getenv("CONF_PADDLE_PORT") -PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM") -PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE") -PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM") -``` - -### Pod间通信 -`start_paddle.py`脚本开始时,会先进行参数的初始化与解析。 - -```python -parser = argparse.ArgumentParser(prog="start_paddle.py", - description='simple tool for k8s') - args, train_args_list = parser.parse_known_args() - train_args = refine_unknown_args(train_args_list) - train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2])) - podlist = getPodList() -``` - -然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态(容器运行都运行)时,再通过函数`getIdMap(podlist)`获取trainer_id。 - -```python - podlist = getPodList() - # need to wait until all pods are running - while not isPodAllRunning(podlist): - time.sleep(10) - podlist = getPodList() - idMap = getIdMap(podlist) -``` -* *注意*: `getPodList()`会获取当前namespace下的所有pod,如果已经有pod运行,可能会导致出错。这种集群节点管理方式会在将来使用[statfulsets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。 - 
-在函数`getIdMap(podlist)`内部,我们通过读取`podlist`中每个pod的IP地址,将IP排序生成的序号作为trainer_id。 - -```python -def getIdMap(podlist): - ''' - generate tainer_id by ip - ''' - ips = [] - for pod in podlist["items"]: - ips.append(pod["status"]["podIP"]) - ips.sort() - idMap = {} - for i in range(len(ips)): - idMap[ips[i]] = i - return idMap -``` - -在得到`idMap`后,通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。 - -### 启动任务 - -在函数`startPaddle`中,最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析,解析环境变量得到`PADDLE_NIC`,`PADDLE_PORT`,`PADDLE_PORTS_NUM`等参数,然后通过自身的IP地址在`idMap`中获取`trainerId`。 - -```python - program = 'paddle train' - args = " --nics=" + PADDLE_NIC - args += " --port=" + str(PADDLE_PORT) - args += " --ports_num=" + str(PADDLE_PORTS_NUM) - args += " --comment=" + "paddle_process_by_paddle" - ip_string = "" - for ip in idMap.keys(): - ip_string += (ip + ",") - ip_string = ip_string.rstrip(",") - args += " --pservers=" + ip_string - args_ext = "" - for key, value in train_args_dict.items(): - args_ext += (' --' + key + '=' + value) - localIP = socket.gethostbyname(socket.gethostname()) - trainerId = idMap[localIP] - args += " " + args_ext + " --trainer_id=" + \ - str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT -``` diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/k8s/k8s_en.md deleted file mode 100644 index 0c3ab05b708e7a924577c26496b8c55126e76c62..0000000000000000000000000000000000000000 --- a/doc/howto/usage/k8s/k8s_en.md +++ /dev/null @@ -1,201 +0,0 @@ -# Paddle On Kubernetes - ->In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster. - -## Build Docker Image - -In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data. - -Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code. -And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image. - -### Run Docker Container - -``` -$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest -``` - -### Download Training Data - -Getting into `/root/paddle/demo/quick_start/data` Directory,using `get_data.sh` to download training data. -Then getting into `/root/paddle/demo/quick_start` Directory, using `preprocess.sh` to pre-process training data. - -``` -$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh - -Downloading Amazon Electronics reviews data... ---2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz -Resolving snap.stanford.edu (snap.stanford.edu)... 
171.64.75.80 -Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected. -HTTP request sent, awaiting response... 200 OK -Length: 495854086 (473M) [application/x-gzip] -Saving to: 'reviews_Electronics_5.json.gz' - - 10% [=======> ] 874,279 64.7KB/s eta 2h 13m - -``` - -### Modify Startup Script - -After downloading the data,modify `/root/paddle/demo/quick_start/train.sh` file contents are as follows (one more cd cmd): -``` -set -e -cd /root/paddle/demo/quick_start -cfg=trainer_config.lr.py -#cfg=trainer_config.emb.py -#cfg=trainer_config.cnn.py -#cfg=trainer_config.lstm.py -#cfg=trainer_config.bidi-lstm.py -#cfg=trainer_config.db-lstm.py -paddle train \ - --config=$cfg \ - --save_dir=./output \ - --trainer_count=4 \ - --log_period=20 \ - --num_passes=15 \ - --use_gpu=false \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -``` - -### Commit Docker Image - -``` -$ docker commit quick_start_data mypaddle/paddle:quickstart -``` - -## Use Kubernetes For Training - ->We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes. - -### Create Yaml Files - -The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows: - -``` -apiVersion: batch/v1 -kind: Job -metadata: - name: quickstart -spec: - parallelism: 1 - completions: 1 - template: - metadata: - name: quickstart - spec: - volumes: - - name: output - hostPath: - path: /home/work/paddle_output - containers: - - name: pi - image: mypaddle/paddle:quickstart - command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"] - volumeMounts: - - name: output - mountPath: /root/paddle/demo/quick_start/output - restartPolicy: Never -``` - -### Start Paddle Job - -Using the above yaml file to start the Kubernetes job. - -``` -$ kubectl create -f paddle.yaml -``` - -Get the detailed status of the job: - -``` -$ kubectl get job -NAME DESIRED SUCCESSFUL AGE -quickstart 1 0 58s - -$ kubectl describe job quickstart -Name: quickstart -Namespace: default -Image(s): registry.baidu.com/public/paddle:cpu-demo-latest -Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84 -Parallelism: 1 -Completions: 1 -Start Time: Mon, 31 Oct 2016 11:20:16 +0800 -Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart -Pods Statuses: 0 Running / 1 Succeeded / 0 Failed -Volumes: - output: - Type: HostPath (bare host directory volume) - Path: /home/work/paddle_output -Events: - FirstSeen LastSeen Count From SubobjectPath Type Reason Message - --------- -------- ----- ---- ------------- -------- ------ ------- - 1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx -``` - -### Get Training Result - -We can use kubectl command to take a look at the status of related pod. 
- -``` -$ kubectl describe pod quickstart-fa0wx -Name: quickstart-fa0wx -Namespace: default -Node: paddle-demo-let02/10.206.202.44 -Start Time: Mon, 31 Oct 2016 11:20:17 +0800 -Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart -Status: Succeeded -IP: 10.0.0.9 -Controllers: Job/quickstart -Containers: - quickstart: - Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465 - Image: registry.baidu.com/public/paddle:cpu-demo-latest - Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750 - Port: - Command: - bin/bash - -c - /root/paddle/demo/quick_start/train.sh - QoS Tier: - cpu: BestEffort - memory: BestEffort - State: Terminated - Reason: Completed - Exit Code: 0 - Started: Mon, 31 Oct 2016 11:20:20 +0800 - Finished: Mon, 31 Oct 2016 11:21:46 +0800 - Ready: False - Restart Count: 0 - Environment Variables: -Conditions: - Type Status - Ready False -Volumes: - output: - Type: HostPath (bare host directory volume) - Path: /home/work/paddle_output -``` - -We can also ssh to Kubernetes node to take a look at the training result. - -``` -[root@paddle-demo-let02 paddle_output]# ll -total 60 -drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000 -drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013 -drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014 -``` diff --git a/doc/howto/usage/k8s/src/Dockerfile b/doc/howto/usage/k8s/src/Dockerfile deleted file mode 100644 index 3a73606c61432329b4cc2d2f8daadc5af8735c96..0000000000000000000000000000000000000000 --- a/doc/howto/usage/k8s/src/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM paddledev/paddle:cpu-latest - -MAINTAINER zjsxzong89@gmail.com - -COPY start.sh /root/ -COPY start_paddle.py /root/ -CMD ["bash"," -c","/root/start.sh"] \ No newline at end of file diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png deleted file mode 100644 index 2183a232ad402b76f82a67234a5c93e13ce97ac3..0000000000000000000000000000000000000000 Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and /dev/null differ diff --git a/doc/howto/usage/k8s/src/k8s_train/Dockerfile b/doc/howto/usage/k8s/src/k8s_train/Dockerfile deleted file mode 100644 index c0fca1f9a945921e6e8899fee2db8845e66136a1..0000000000000000000000000000000000000000 --- a/doc/howto/usage/k8s/src/k8s_train/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM paddledev/paddle:cpu-latest - -COPY start.sh /root/ -COPY start_paddle.py /root/ -RUN chmod +x /root/start.sh -CMD ["bash"," -c","/root/start.sh"] diff --git a/doc/index_cn.rst b/doc/index_cn.rst index 9279bac7f4b2898c18979630a8d6dfcb2dba70e0..ada51c2d73263898b2c748437f8eb0f30b537073 100644 --- a/doc/index_cn.rst +++ b/doc/index_cn.rst @@ -8,3 +8,4 @@ PaddlePaddle 文档 howto/index_cn.rst api/index_cn.rst faq/index_cn.rst + mobile/index_cn.rst diff --git a/doc/index_en.rst b/doc/index_en.rst index 
168c7667c61da677905585d6c4b5037ce80b3765..23b64b6cadf776d44c4d0aa5a550ffe24be13b18 100644 --- a/doc/index_en.rst +++ b/doc/index_en.rst @@ -7,4 +7,4 @@ PaddlePaddle Documentation getstarted/index_en.rst howto/index_en.rst api/index_en.rst - about/index_en.rst + mobile/index_en.rst diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..ae24ced770492743065e37654b494caf6b4c5bc0 --- /dev/null +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -0,0 +1,177 @@ +# Android平台编译指南 + +用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: + +- [基于Docker容器的编译方式](#基于docker容器的编译方式) +- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式) + +## 基于Docker容器的编译方式 +Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。 + +### 构建PaddlePaddle的Android开发镜像 +我们把PaddlePaddle的交叉编译环境打包成一个镜像,称为开发镜像,里面涵盖了交叉编译Android版PaddlePaddle库需要的所有编译工具。 + +```bash +$ git clone https://github.com/PaddlePaddle/Paddle.git +$ cd Paddle +$ docker build -t username/paddle-android:dev . -f Dockerfile.android +``` + +用户也可以使用PaddlePaddle提供的官方开发镜像: + +```bash +$ docker pull paddlepaddle/paddle:latest-dev-android +``` + +### 编译PaddlePaddle C-API库 +构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。 +Android的Docker开发镜像向用户提供两个可配置的参数: + + ++ + + + + + + + + + + + + + + + + + + + + + + +
| Argument | Optional Values | Default |
|----------|-----------------|---------|
| ANDROID_ABI | armeabi-v7a, arm64-v8a | armeabi-v7a |
| ANDROID_API | >= 16 | 21 |
+ +- 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 + + ```bash + $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev + ``` + +- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 + + ```bash + $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev + ``` + +执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 + +## 基于Linux交叉编译环境的编译方式 +本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 + +### 准备交叉编译环境 + +从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn),用户可自行前往下载预编译好的版本,也可通过以下命令获取: + +```bash +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +``` + +Android NDK中包含了所有Android API级别、所有架构(arm/arm64/x86/mips)需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别,构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。 + +- 构建`armeabi-v7a`、 `Android API 21`的独立工具链: + +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain +``` + +此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 + +- 构建`arm64-v8a`、 `Android API 21`的独立工具链: + +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain +``` + +此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 + +### 配置交叉编译参数 + +CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。 + +交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数: +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译PaddlePaddle所需的所有第三方库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`、`WITH_MKL=OFF`、`WITH_GOLANG=OFF`)。 +- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。 +- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。 + +Android平台可选配置参数: + +- `ANDROID_STANDALONE_TOOLCHAIN`,独立工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别;否则,用户需要在cmake时手动设置这些值。无默认值。 +- `ANDROID_TOOLCHAIN`,目标工具链。可设置`gcc/clang`,默认值为`clang`。 + - 
CMake 3.7以上,将会始终使用`clang`工具链;CMake 3.7以下,可设置`ANDROID_TOOLCHAIN=gcc`以使用`gcc`工具链。 + - Android官方提供的`clang`编译器要求系统支持`GLIBC 2.15`以上。 +- `ANDROID_ABI`,目标架构ABI。目前支持`armeabi-v7a`和`arm64-v8a`,默认值为`armeabi-v7a`。 +- `ANDROID_NATIVE_API_LEVEL`,工具链的Android API级别。若没有显式设置,PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。 +- `ANROID_ARM_MODE`,是否使用ARM模式。 + - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`; + - `ANDROID_ABI=arm64-v8a`时,不需要设置。 +- `ANDROID_ARM_NEON`,是否使用NEON指令。 + - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`; + - `ANDROID_ABI=arm64-v8a`时,不需要设置。 + +其他配置参数: + +- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`。 +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值;若环境变量`CC/CXX`没有设置,则设置成`cc/c++`编译器。 + +常用的cmake配置如下: + +```bash +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +``` +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \ + -DANDROID_ABI=arm64-v8a \ + -DUSE_EIGEN_FOR_BLAS=OFF \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 + +**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: + +- 设置`CMAKE_BUILD_TYPE`为`Release` +- 使用`clang`编译工具链 +- `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算 + +### 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 + +```bash +make +make install +``` + +注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含`include`、`lib`和`third_party`目录,其中`include`中包含C-API的头文件,`lib`中包含若干个不同Android ABI的PaddlePaddle库,`third_party`中包含所依赖的所有第三方库。自此,PaddlePaddle的已经安装完成,用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中,调用方法见C-API文档。 diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md new file mode 100644 index 0000000000000000000000000000000000000000..0cf50181df4116beda3aa6faf836eda92edf6066 --- /dev/null +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -0,0 +1,183 @@ +# Build PaddlePaddle for Android + +There are two approaches to build PaddlePaddle for Android: + +- [Cross-Compiling Using Docker](#cross-compiling-using-docker) +- [Cross-Compiling on Linux](#cross-compiling-on-linux) + +## Cross-Compiling Using Docker + +Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows. + +### Build the Docker Image + +The following steps pack all the tools that we need to build PaddlePaddle into a Docker image. + +```bash +$ git clone https://github.com/PaddlePaddle/Paddle.git +$ cd Paddle +$ docker build -t paddle:dev-android . -f Dockerfile.android +``` + +Users can directly use the published Docker image. 
+ +```bash +$ docker pull paddlepaddle/paddle:latest-dev-android +``` + +### Build the Inference Library + +We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: + +```bash +$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android +``` + +The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: + + ++ + + + + + + + + + + + + + + + + + + + + + + +
| Argument | Optional Values | Default |
|----------|-----------------|---------|
| ANDROID_ABI | armeabi-v7a, arm64-v8a | armeabi-v7a |
| ANDROID_API | >= 16 | 21 |
+ +The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API. + +The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading. + +The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`. + +## Cross-Compiling on Linux + +The Linux-base approach to cross-compile is to run steps in `Dockerfile.android` manually on a Linux x64 computer. + +### Setup the Environment + +To build for Android's, we need [Android NDK]( +https://developer.android.com/ndk/downloads/index.html): + +```bash +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +``` + +Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which in then used to build PaddlePaddle for Android. (We plan to remove the intermediate stage of building the standalone toolchain in the near future.) + +- To build the standalone toolchain for `armeabi-v7a` and Android API level 21: + + ```bash + your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain + ``` + + The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`. + +- To build the standalone toolchain for `arm64-v8a` and Android API level 21: + + ```bash + your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain + ``` + + The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`. + +### Cross-Compiling Arguments + +CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling). + +Some other CMake arguments you need to know: + +- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`. +- `WITH_C_API` must be `ON`, to build the C-based inference library for Android. +- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API. + +Some Android-specific arguments: + +- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory. 
PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument. +- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`. The default value is `clang`. + - For CMake >= 3.7, it should anyway be `clang`. For older versions, it could be `gcc`. + - Android's official `clang` requires `glibc` >= 2.15. +- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`. The default value is `armeabi-v7a`. +- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`. +- `ANROID_ARM_MODE`: + - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`; + - no need to specify when `ANDROID_ABI=arm64-v8a`. +- `ANDROID_ARM_NEON`: indicates if to use NEON instructions. + - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`; + - no need to specify when `ANDROID_ABI=arm64-v8a`. + +Other useful arguments: + +- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`. +- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC/C++`, or `cc/c++`. + +Some frequent configurations for your reference: + +```bash +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +``` +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \ + -DANDROID_ABI=arm64-v8a \ + -DUSE_EIGEN_FOR_BLAS=OFF \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + + +There are some other arguments you might want to configure. + +- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of library. +- `CMAKE_BUILD_TYPE-Release` optimizes the runtime performance. + +Our own tip for performance optimization to use clang and Eigen or OpenBLAS: + +- `CMAKE_BUILD_TYPE=Release` +- `ANDROID_TOOLCHAIN=clang` +- `USE_EIGEN_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`. + +### Build and Install + +After running `cmake`, we can run `make; make install` to build and install. + +Before building, you might want to remove the `third_party` and `build` directories including pre-built libraries for other architectures. + +After building,in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories: + +- `include`: the header file of the inference library, +- `lib`: the inference library built for various Android ABIs, +- `third_party`: dependent third-party libraries built for Android. 
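If both ABIs are needed, the two `docker run` invocations above can be wrapped in a small script. The sketch below is a convenience under stated assumptions, not part of the official build scripts: it assumes the container writes its result to `./install_android` as described earlier, and it simply moves that directory aside after each build.

```bash
#!/bin/bash
set -e
cd /path/to/Paddle   # placeholder: your PaddlePaddle checkout

for abi in armeabi-v7a arm64-v8a; do
  # Remove artifacts left over from a previous configuration, as advised above.
  rm -rf build third_party install_android
  docker run --rm -v "$PWD":/paddle \
    -e "ANDROID_ABI=${abi}" -e "ANDROID_API=21" \
    paddlepaddle/paddle:latest-dev-android
  mv install_android "install_android_${abi}"
done
```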
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..d5196d9a4c93c7692d2a624ec7d0650e32806338 --- /dev/null +++ b/doc/mobile/cross_compiling_for_ios_cn.md @@ -0,0 +1,117 @@ +# iOS平台编译指南 +交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。 + +## 准备交叉编译环境 +Apple官方为iOS开发提供了完整的交叉编译工具和集成开发环境,用户从App Store下载安装Xcode即可。也可自行前往官网下载,[Xcode](https://developer.apple.com/cn/xcode/)。安装完成之后,可在命令行执行`xcodebuild -version`,判断是否安装成功。 + +```bash +$ xcodebuild -version +Xcode 9.0 +Build version 9A235 +``` + +## 配置交叉编译参数 + +PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake),以提供一些默认的编译器和编译参数配置。 + +交叉编译iOS版本的PaddlePaddle库时,有一些必须配置的参数: + +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后,PaddlePaddle的CMake系统会自动编译所有的第三方依赖库,并且强制设置一些PaddlePaddle参数的值(`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 +- `WITH_C_API`,是否编译C-API预测库,必须设置为ON。在iOS平台上只支持使用C-API来预测。 +- `WITH_SWIG_PY`,必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。 + +iOS平台可选配置参数: + +- `IOS_PLATFORM`,可设置为`OS`(默认值)或`SIMULATOR`。 + - `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。 + - `SIMULATOR`,构建目标为`x86`架构的模拟器平台。 +- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构: + + + + + + + + + + + + + + + + + + + + + + +
| IOS_PLATFORM | IOS_ARCH |
|--------------|----------|
| OS | armv7, armv7s, arm64 |
| SIMULATOR | i386, x86_64 |
+ +- `IOS_DEPLOYMENT_TARGET`,最小的iOS部署版本,默认值为`7.0`。 +- `IOS_ENABLE_BITCODE`,是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3),可设置`ON/OFF`,默认值为`ON`。 +- `IOS_USE_VECLIB_FOR_BLAS`,是否使用[vecLib](https://developer.apple.com/documentation/accelerate/veclib)框架进行BLAS矩阵计算,可设置`ON/OFF`,默认值为`OFF`。 +- `IOS_DEVELOPMENT_ROOT`,`Developer`目录,可显式指定为`/path/to/platform/Developer`。若未显式指定,PaddlePaddle将会根据`IOS_PLATFORM`自动选择`Xcode`对应`platform`的`Developer`目录。 +- `IOS_SDK_ROOT`,所使用`SDK`的根目录,可显式指定为`/path/to/platform/Developer/SDKs/SDK`。若未显式指定,PaddlePaddle将会自动选择`IOS_DEVELOPMENT_ROOT`目录下最新的`SDK`版本。 + +其他配置参数: + +- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算,在`IOS_USE_VECLIB_FOR_BLAS=OFF`时有效。可设置`ON/OFF`,默认值为`OFF`。 +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。默认值为环境变量`CC/CXX`的值;若环境变量`CC/CXX`未设置,则使用`cc/c++`编译器。 + +常用的cmake配置如下: + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=OS \ + -DIOS_ARCH="armv7;arm64" \ + -DIOS_ENABLE_BITCODE=ON \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=SIMULATOR \ + -DIOS_ARCH="x86_64" \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望得到最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 + +**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: + +- 设置`CMAKE_BUILD_TYPE`为`Release` +- 设置`IOS_USE_VECLIB_FOR_BLAS=ON`,调用`vecLib`框架提供的BLAS函数进行矩阵计算。 + +## 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 + +``` +$ make +$ make install +``` + +注意:如果你曾在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含以下内容: + +- `include`目录,其中包含所有C-API的头文件 +- `lib`目录,其中包含PaddlePaddle的C-API静态库 +- `third_party`目录,其中包含所依赖的所有第三方库 + +注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。 + +自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。 diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md new file mode 100644 index 0000000000000000000000000000000000000000..19bfe86c511c7e43b462f94c8cabba420b3007f1 --- /dev/null +++ b/doc/mobile/cross_compiling_for_ios_en.md @@ -0,0 +1,120 @@ +# Build PaddlePaddle for iOS + +This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS. + +## Preparation + +Apple provides Xcode for cross-compiling and IDE for iOS development. Download from App store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run command as follows + +```bash +$ xcodebuild -version +Xcode 9.0 +Build version 9A235 +``` + +## Cross-compiling configurations + +PaddlePaddle provides cross-compiling toolchain configuration documentation [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers. + +There are some mandatory environment variables need to be set before cross compiling PaddlePaddle for iOS: + +- `CMAKE_SYSTEM_NAME`, CMake compiling target platform name, has to be `iOS`. 
PaddlePaddle CMake will compile all the third party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`,`WITH_RDMA=OFF`) when this variable is set with value `iOS`. + +- `WITH_C_API`, Whether to compile inference C-API library, has to be `ON`, since C-API is the only supported interface for inferencing in iOS. +- `WITH_SWIG_PY`, has to be `OFF`. It's not supported to inference or train via swig in iOS. + +Optional environment variables for iOS are: + +- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`. + - `OS`, build targets ARM-based physical devices like iPhone or iPad. + - `SIMULATOR`, build targets x86 architecture simulators. +- `IOS_ARCH`, target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find valid values for different `IOS_PLATFORM` settings from the table below: + + + + + + + + + + + + + + + + + + + + + + +
| IOS_PLATFORM | IOS_ARCH |
|--------------|----------|
| OS | armv7, armv7s, arm64 |
| SIMULATOR | i386, x86_64 |
+ +- `IOS_DEPLOYMENT_TARGET`, minimum iOS version to deployment, `7.0` by default. +- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), values can be `ON/OFF`, `ON` by default. +- `IOS_USE_VECLIB_FOR_BLAS`, whether to use [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. values can be `ON/OFF`, `OFF` by default. +- `IOS_DEVELOPMENT_ROOT`, the path to `Developer` directory, can be explicitly set with your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the Xcode corresponding `platform`'s `Developer` directory based on your `IOS_PLATFORM` value. +- `IOS_SDK_ROOT`, the path to `SDK` root, can be explicitly set with your `/path/to/platform/Developer/SDKs/SDK`. if left black, PaddlePaddle will pick the latest SDK in the directory of `IOS_DEVELOPMENT_ROOT`. + +other settings: + +- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default. +- `HOST_C/CXX_COMPILER`, host C/C++ compiler. Uses value from environment variable `CC/CXX` by default or `cc/c++` if `CC/CXX` doesn't exist. + +some typical cmake configurations: + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=OS \ + -DIOS_ARCH="armv7;arm64" \ + -DIOS_ENABLE_BITCODE=ON \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=SIMULATOR \ + -DIOS_ARCH="x86_64" \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +You can set other compiling parameters for your own need. I.E. if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` with `MinSizeRel`; or if the performance is your concern, set `CMAKE_BUILD_TYPE` with `Release`. You can even manipulate the PaddlePaddle compiling procedure by manually set `CMAKE_C/CXX_FLAGS` values. + +**TIPS for a better performance**: + +- set `CMAKE_BUILD_TYPE` with `Release` +- set `IOS_USE_VECLIB_FOR_BLAS` with `ON` + +## Build and install + +After CMake, run following commands, PaddlePaddle will download the compile 3rd party dependencies, compile and install PaddlePaddle inference library. + +``` +$ make +$ make install +``` + +Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration. + +`your/path/to/install` directory will have following directories after `make install`: + +- `include`, contains all the C-API header files. +- `lib`, contains PaddlePaddle C-API static library. +- `third_party` contains all the 3rd party libraries. + +Please note: if PaddlePaddle library need to support both physical devices and simulators, you will need to compile correspondingly, then merge fat library with `lipo`. + +Now you will have PaddlePaddle library compiled and installed, the fat library can be used in deep learning related iOS APPs. Please refer to C-API documentation for usage guides. 
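As a concrete illustration of the `lipo` step mentioned above, the following sketch merges a device build and a simulator build into one fat archive. The install prefixes and the archive name are placeholders; check the actual file name under `your/path/to/install/lib` after `make install`.

```bash
# Merge the two single-platform static libraries into one fat library.
lipo -create \
    ios_device_install/lib/libpaddle_capi.a \
    ios_simulator_install/lib/libpaddle_capi.a \
    -output libpaddle_capi_fat.a

# Confirm which architectures the merged archive contains.
lipo -info libpaddle_capi_fat.a
```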
diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..f8ef9dc8031613831437745995268f3abc392f5b --- /dev/null +++ b/doc/mobile/cross_compiling_for_raspberry_cn.md @@ -0,0 +1,62 @@ +# Raspberry Pi平台编译指南 + +通常有两个方法来构建基于 Rasspberry Pi 的版本: + +1. 通过ssh等方式登录到Raspberry Pi系统上来构建。所需的开发工具和第三方库可以参考 [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)。 + +1. 另一个方法是交叉编译。这篇文档介绍在 Linux/x64 上交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。 + +## 安装交叉编译器 + +克隆下面 Github repo + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +即可在 `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64` 目录里找到交叉编译器 arm-linux-gnueabihf-gcc 4.8.3。运行该编译工具链需要一台 Linux x64 机器上以及 2.14版本以上的 glibc。 + +## 配置交叉编译参数 + +CMake[支持交叉编译](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。PaddlePaddle for Raspberry Pi的配置信息在[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake)。 + +交叉编译Raspberry Pi版本PaddlePaddle库时,有一些必须配置的参数: + +- `CMAKE_SYSTEM_NAME`:CMake编译的目标平台,必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后,PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。 + +- `RPI_TOOLCHAIN`:编译工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器;否则,用户需要在cmake时手动设置这些值。无默认值。 + +- `RPI_ARM_NEON`:是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 + +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 + +一个常用的CMake配置如下: + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +其中`WITH_C_API=ON`表示需要构建推理库。 + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。 + +## 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle。 + +```bash +make +make install +``` + +注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Raspberry Pi版本的库。 diff --git a/doc/mobile/cross_compiling_for_raspberry_en.md b/doc/mobile/cross_compiling_for_raspberry_en.md new file mode 100644 index 0000000000000000000000000000000000000000..3c1a5950ff9553bb725d5a96e3fdf2e5e9f6f95c --- /dev/null +++ b/doc/mobile/cross_compiling_for_raspberry_en.md @@ -0,0 +1,62 @@ +# Build PaddlePaddle for Raspberry Pi + +You may use any of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi: + +1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library. The required development tools and third-party dependencies are listed in here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). + +1. Cross-compile: We talk about how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine, in more detail in this article. + +## The Cross-Compiling Toolchain + +Step 1. Clone the Github repo by running the following command. + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +Step 2. 
Use the pre-built cross-compiler found in `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`. To run it on a Linux computer, glibc version >= 2.14 is needed. + +## CMake Arguments + +CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). All CMake configuration arguments required for the cross-compilation for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake). + +Some important arguments that need to be set: + +- `CMAKE_SYSTEM_NAME`: The target platform. Must be `RPi`. + +- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain. + +- `RPI_ARM_NEON`: Use ARM NEON Intrinsics. This is a required argument and set default to `ON`. + +- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host. It is used to build building tools running on the host, for example, protoc. + +A commonly-used CMake configuration is as follows: + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +To build the inference library, please set the argument WITH\_C\_API to ON: `WITH_C_API=ON`. + +You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`. + +## Build and Install + +The following commands build the inference library of PaddlePaddle for Raspberry Pi and third-party dependencies. + +```bash +make +make install +``` + + The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built it for other platforms like Android or iOS, you may want to clear these directories by running the command: `rm -rf build`. + +The infernece library will be in `your/path/to/install/lib`, with related header files in `your/path/to/install/include`. diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..1d99666e58b7043b85b0203ee0dfcd1957710161 --- /dev/null +++ b/doc/mobile/index_cn.rst @@ -0,0 +1,9 @@ +MOBILE +====== + +.. toctree:: + :maxdepth: 1 + + cross_compiling_for_android_cn.md + cross_compiling_for_ios_cn.md + cross_compiling_for_raspberry_cn.md diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..ef421dacad458828cadf8cf505375d6c4bfd9dde --- /dev/null +++ b/doc/mobile/index_en.rst @@ -0,0 +1,9 @@ +MOBILE +====== + +.. toctree:: + :maxdepth: 1 + + cross_compiling_for_android_en.md + cross_compiling_for_ios_en.md + cross_compiling_for_raspberry_en.md diff --git a/doc/survey/cluster_bootstrapping_tools.md b/doc/survey/cluster_bootstrapping_tools.md new file mode 100644 index 0000000000000000000000000000000000000000..1cd9962700bb49866f1ed6987abc28b27888a23f --- /dev/null +++ b/doc/survey/cluster_bootstrapping_tools.md @@ -0,0 +1,71 @@ +# Cluster bootstrapping tool survey +## Abstract +In order to bring up a cluster from bare metal machine to a fully functional kubernetes cluster for Paddlepaddle to run, we need to utilize some tools. 
Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer) + +## Basic assumptions +Here are some basic assumptions before we move on to details +1. You are an administrator of a bare metal machine cluster, which means: + * you have full control to each of the machines. + * you have full control to the network which machines are connected to. +2. Machines can be booted from network with PEX or iPXE +3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster) + +if your cluster is able to mark above items with checkmarks, then keep reading. + +## Comparing Sextant and Tectonic installer +### Sextant +Sextant is an end2end solution to bring up a bare metal cluster to a fully functional k8s cluster, it integrates DHCP, name service, PEX, cloud-config-service, docker registry services altogether. + +#### Pros +1. End2End: basically all admin need to do is to config the cluster.yaml and power on the cluster. +2. Offline cluster configuration: Sextant has 2 phases during working with it, config time and deploy time. when admin is configuring, it requires admin's machine has internet connectivity, which will download some images, etc. But in deploy time, it's completely OK to go offline since all dependencies are ready during config time. +3. docker registry integrated. +4. GPU machine took care of. + +### Cons +1. k8s API server is not deployed with high availability in considering by default. +2. No grouping support. +3. No API interface, a one-off service. + + +### Tectonic installer +First of all, Tectonic is not free, it requires coreos.com account as a step of installation, and free user can only create less than 10 nodes. + +Tectonic is a suite of software which wraps around k8s and providing more utility regarding dev ops, ie, +Tectonic installer as it's named, it installs Tectonic to a bare metal cluster which means it's not totally an equivalent of Sextant. At the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper. + +Matchbox's Approach is similar to Sexstant. + +### Pros +1. supports grouping machines. +2. supports running provisioning service in rtk. (not a big deal though). +3. supports http/gRPC API interface. +4. supports multi-template. + +### Cons +1. Not an e2e solution to bring up a cluster, need a lot of extra work and other software. +2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) centOS deployment yet. + +## Conclusion +Sextant is a better solution overall for paddle cloud deploying to a bare metal cluster. It would be great if Sextant can also 1) deploy k8s api server with high availability by default; 2) not designed as a one-off service. + + + +## Appendix: General procedure to bring up a cluster +It's physically impossible for a cluster admin to manually install OS and applications into cluster nodes one by one, here is what an admin would do in cloud industry: +1. setup a bootstrap machine with static IP in the cluster, which has following services: + * DHCP: assigns ip address for rest of the nodes. + * name service: to map node name to a IP + * PXE related services: the booting related info will be delivered to newly booted machines as their IP is assigned via DHCP service, PXE service will provide further booting and installing info and image with TFTP and http protocol. 
+ * cluster config service: this is for providing cluster node with OS config via http + * optional docker registry: a built-in docker registry makes the whole cluster independent from connecting internet, and speeds up software distribution. +2. New node powers on, it will + * broadcast the request for an IP address + * DHCP server assigns the IP address, and deliver the PXE booting related info to the node. + * cluster node will request config files with booting info delivered with DHCP via the TFTP service, and in most of the cases, the config file will point to a http service for the booting image. + * Since PXE is configured with initrd, it will utilize the cloud config service and do further installations like coreOS or K8s installations. + * then restart the node. + +For further understanding, following 2 links from Matchbox are some good readings: +* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md) +* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md) diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in index 95cad835b11816f4d2e256c2abd662a545a5bad2..41b35b5b233abd737db07aaeb6c6dd4bf6d42b08 100644 --- a/doc/templates/conf.py.cn.in +++ b/doc/templates/conf.py.cn.in @@ -13,22 +13,18 @@ # serve to show the default. import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PROJ_ROOT@/doc_theme/templates"] +templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"] # -- General configuration ------------------------------------------------ @@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['@PROJ_ROOT@/doc_theme/static'] +html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static'] # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in index b477f0120c4fa0544012080b7cfb8572d3c44b04..5822c2481dd61da2084b0de76f6f65aa4e32e033 100644 --- a/doc/templates/conf.py.en.in +++ b/doc/templates/conf.py.en.in @@ -13,15 +13,11 @@ # serve to show the default. 
import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser @@ -29,7 +25,7 @@ AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PROJ_ROOT@/doc_theme/templates"] +templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"] # -- General configuration ------------------------------------------------ @@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['@PROJ_ROOT@/doc_theme/static'] +html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static'] # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md deleted file mode 100644 index 2b4a79fbbfc0c4af74aa73c540919f5d9cf2635b..0000000000000000000000000000000000000000 --- a/doc/tutorials/embedding_model/index_cn.md +++ /dev/null @@ -1,139 +0,0 @@ -# 中文词向量模型的使用 # ----------- -本文档介绍如何在PaddlePaddle平台上,使用预训练的标准格式词向量模型。 - -在此感谢 @lipeng 提出的代码需求,并给出的相关模型格式的定义。 - -## 介绍 ### -### 中文字典 ### -我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下: "《红楼梦》"将被分为 "《","红楼梦","》",和 "《红楼梦》"。字典采用UTF8编码,输出有2列:词本身和词频。字典共包含 3206326个词和4个特殊标记: - - ``: 分词序列的开始 - - ``: 分词序列的结束 - - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: 占位符,没有实际意义 - - ``: 未知词 - -### 中文词向量的预训练模型 ### -遵循文章 [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)中介绍的方法,模型采用 n-gram 语言模型,结构如下图:6元上下文作为输入层->全连接层->softmax层 。对应于字典,我们预训练得到4种不同维度的词向量,分别为:32维、64维、128维和256维。 -
![](./neural-n-gram-model.png)
-
Figure 1. neural-n-gram-model
- -### 下载和数据抽取 ### -运行以下的命令下载和获取我们的字典和预训练模型: - - cd $PADDLE_ROOT/demo/model_zoo/embedding - ./pre_DictAndModel.sh - -## 中文短语改写的例子 ## -以下示范如何使用预训练的中文字典和词向量进行短语改写。 - -### 数据的准备和预处理 ### -首先,运行以下的命令下载数据集。该数据集(utf8编码)包含20个训练样例,5个测试样例和2个生成式样例。 - - cd $PADDLE_ROOT/demo/seqToseq/data - ./paraphrase_data.sh - -第二步,将数据处理成规范格式,在训练数集上训练生成词向量字典(数据将保存在 `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`): - - cd $PADDLE_ROOT/demo/seqToseq/ - python preprocess.py -i data/paraphrase [--mergeDict] - -- 其中,如果使用`--mergeDict`选项,源语言短语和目标语言短语的字典将被合并(源语言和目标语言共享相同的编码字典)。本实例中,源语言和目标语言都是相同的语言,因此可以使用该选项。 - - -### 使用用户指定的词向量字典 ### -使用如下命令,从预训练模型中,根据用户指定的字典,抽取对应的词向量构成新的词表: - cd $PADDLE_ROOT/demo/model_zoo/embedding - python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL--usrDict USRDICT -d DIM - -- `--preModel PREMODEL`: 预训练词向量字典模型的路径 -- `--preDict PREDICT`: 预训练模型使用的字典的路径 -- `--usrModel USRMODEL`: 抽取出的新词表的保存路径 -- `--usrDict USRDICT`: 用户指定新的字典的路径,用于构成新的词表 -- `-d DIM`: 参数(词向量)的维度 - -此处,你也可以简单的运行以下的命令: - - cd $PADDLE_ROOT/demo/seqToseq/data/ - ./paraphrase_model.sh - -运行成功以后,你将会看到以下的模型结构: - - paraphrase_model - |--- _source_language_embedding - |--- _target_language_embedding - -### 在PaddlePaddle平台训练模型 ### -首先,配置模型文件,配置如下(可以参考保存在 `demo/seqToseq/paraphrase/train.conf`的配置): - - from seqToseq_net import * - is_generating = False - - ################## Data Definition ##################### - train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase", - job_mode = job_mode) - - ############## Algorithm Configuration ################## - settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - - ################# Network configure ##################### - gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32) - -这个配置与`demo/seqToseq/translation/train.conf` 基本相同 - -然后,使用以下命令进行模型训练: - - cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase - ./train.sh - -其中,`train.sh` 与`demo/seqToseq/translation/train.sh` 基本相同,只有2个配置不一样: - -- `--init_model_path`: 初始化模型的路径配置为`data/paraphrase_modeldata/paraphrase_model` -- `--load_missing_parameter_strategy`:如果参数模型文件缺失,除词向量模型外的参数将使用正态分布随机初始化 - -如果用户想要了解详细的数据集的格式、模型的结构和训练过程,请查看 [Text generation Tutorial](../text_generation/index_cn.md). - -## 可选功能 ## -### 观测词向量 -PaddlePaddle 平台为想观测词向量的用户提供了将二进制词向量模型转换为文本模型的功能: - - cd $PADDLE_ROOT/demo/model_zoo/embedding - python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM - -- `-i INPUT`: 输入的(二进制)词向量模型名称 -- `-o OUTPUT`: 输出的文本模型名称 -- `-d DIM`: (词向量)参数维度 - -运行完以上命令,用户可以在输出的文本模型中看到: - - 0,4,32156096 - -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ...... - 0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ...... - ...... - -- 其中,第一行是`PaddlePaddle` 输出文件的格式说明,包含3个属性:: - - `PaddlePaddle`的版本号,本例中为0 - - 浮点数占用的字节数,本例中为4 - - 总计的参数个数,本例中为32,156,096 -- 其余行是(词向量)参数行(假设词向量维度为32) - - 每行打印32个参数以','分隔 - - 共有32,156,096/32 = 1,004,877行,也就是说,模型共包含1,004,877个被向量化的词 - -### 词向量模型的修正 -`PaddlePaddle` 为想修正词向量模型的用户提供了将文本词向量模型转换为二进制模型的命令: - - cd $PADDLE_ROOT/demo/model_zoo/embedding - python paraconvert.py --t2b -i INPUT -o OUTPUT - -- `-i INPUT`: 输入的文本词向量模型名称 -- `-o OUTPUT`: 输出的二进制词向量模型名称 - -请注意,输入的文本格式如下: - - -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ...... - 0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ...... - ...... 
-- 输入文本中没有头部(格式说明)行 -- (输入文本)每行存储一个词,以逗号','分隔 diff --git a/doc/tutorials/embedding_model/index_en.md b/doc/tutorials/embedding_model/index_en.md deleted file mode 100644 index 9525f64f9b5384c8e44690fb0887fb2293108e0a..0000000000000000000000000000000000000000 --- a/doc/tutorials/embedding_model/index_en.md +++ /dev/null @@ -1,140 +0,0 @@ -# Chinese Word Embedding Model Tutorial # ----------- -This tutorial is to guide you through the process of using a Pretrained Chinese Word Embedding Model in the PaddlePaddle standard format. - -We thank @lipeng for the pull request that defined the model schemas and pretrained the models. - -## Introduction ### -### Chinese Word Dictionary ### -Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using in-house word segmentor. For example, the participle of "《红楼梦》" is "《","红楼梦","》",and "《红楼梦》". Our dictionary (using UTF-8 format) has has two columns: word and its frequency. The total word count is 3206326, including 4 special token: - - ``: the start of a sequence - - ``: the end of a sequence - - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: a placeholder, just ignore it and its embedding - - ``: a word not included in dictionary - -### Pretrained Chinese Word Embedding Model ### -Inspired by paper [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), our model architecture (**Embedding joint of six words->FullyConnect->SoftMax**) is as following graph. And for our dictionary, we pretrain four models with different word vector dimenstions, i.e 32, 64, 128, 256. -
![](./neural-n-gram-model.png)
-
Figure 1. neural-n-gram-model
- -### Download and Extract ### -To download and extract our dictionary and pretrained model, run the following commands. - - cd $PADDLE_ROOT/demo/model_zoo/embedding - ./pre_DictAndModel.sh - -## Chinese Paraphrasing Example ## -We provide a paraphrasing task to show the usage of pretrained Chinese Word Dictionary and Embedding Model. - -### Data Preparation and Preprocess ### - -First, run the following commands to download and extract the in-house dataset. The dataset (using UTF-8 format) has 20 training samples, 5 testing samples and 2 generating samples. - - cd $PADDLE_ROOT/demo/seqToseq/data - ./paraphrase_data.sh - -Second, preprocess data and build dictionary on train data by running the following commands, and the preprocessed dataset is stored in `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`: - - cd $PADDLE_ROOT/demo/seqToseq/ - python preprocess.py -i data/paraphrase [--mergeDict] - -- `--mergeDict`: if using this option, the source and target dictionary are merged, i.e, two dictionaries have the same context. Here, as source and target data are all chinese words, this option can be used. - -### User Specified Embedding Model ### -The general command of extracting desired parameters from the pretrained embedding model based on user dictionary is: - - cd $PADDLE_ROOT/demo/model_zoo/embedding - python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL--usrDict USRDICT -d DIM - -- `--preModel PREMODEL`: the name of pretrained embedding model -- `--preDict PREDICT`: the name of pretrained dictionary -- `--usrModel USRMODEL`: the name of extracted embedding model -- `--usrDict USRDICT`: the name of user specified dictionary -- `-d DIM`: dimension of parameter - -Here, you can simply run the command: - - cd $PADDLE_ROOT/demo/seqToseq/data/ - ./paraphrase_model.sh - -And you will see following embedding model structure: - - paraphrase_model - |--- _source_language_embedding - |--- _target_language_embedding - -### Training Model in PaddlePaddle ### -First, create a model config file, see example `demo/seqToseq/paraphrase/train.conf`: - - from seqToseq_net import * - is_generating = False - - ################## Data Definition ##################### - train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase", - job_mode = job_mode) - - ############## Algorithm Configuration ################## - settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - - ################# Network configure ##################### - gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32) - -This config is almost the same as `demo/seqToseq/translation/train.conf`. - -Then, train the model by running the command: - - cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase - ./train.sh - -where `train.sh` is almost the same as `demo/seqToseq/translation/train.sh`, the only difference is following two command arguments: - -- `--init_model_path`: path of the initialization model, here is `data/paraphrase_model` -- `--load_missing_parameter_strategy`: operations when model file is missing, here use a normal distibution to initialize the other parameters except for the embedding layer - -For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/index_en.md). 
- -## Optional Function ## -### Embedding Parameters Observation -For users who want to observe the embedding parameters, this function can convert a PaddlePaddle binary embedding model to a text model by running the command: - - cd $PADDLE_ROOT/demo/model_zoo/embedding - python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM - -- `-i INPUT`: the name of input binary embedding model -- `-o OUTPUT`: the name of output text embedding model -- `-d DIM`: the dimension of parameter - -You will see parameters like this in output text model: - - 0,4,32156096 - -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ...... - 0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ...... - ...... - -- 1st line is **PaddlePaddle format file head**, it has 3 attributes: - - version of PaddlePaddle, here is 0 - - sizeof(float), here is 4 - - total number of parameter, here is 32156096 -- Other lines print the paramters (assume `` = 32) - - each line print 32 paramters splitted by ',' - - there is 32156096/32 = 1004877 lines, meaning there is 1004877 embedding words - -### Embedding Parameters Revision -For users who want to revise the embedding parameters, this function can convert a revised text embedding model to a PaddlePaddle binary model by running the command: - - cd $PADDLE_ROOT/demo/model_zoo/embedding - python paraconvert.py --t2b -i INPUT -o OUTPUT - -- `-i INPUT`: the name of input text embedding model. -- `-o OUTPUT`: the name of output binary embedding model - -Note that the format of input text model is as follows: - - -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ...... - 0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ...... - ...... -- there is no file header in 1st line -- each line stores parameters for one word, the separator is commas ',' diff --git a/doc/tutorials/embedding_model/neural-n-gram-model.png b/doc/tutorials/embedding_model/neural-n-gram-model.png deleted file mode 100644 index f70b765b3fd69816345a79fc59adfea46008dbfd..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/embedding_model/neural-n-gram-model.png and /dev/null differ diff --git a/doc/tutorials/gan/gan.png b/doc/tutorials/gan/gan.png deleted file mode 100644 index 0eafd7cb49b545f412f8e775804bcd0b22c42454..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/gan/gan.png and /dev/null differ diff --git a/doc/tutorials/gan/index_en.md b/doc/tutorials/gan/index_en.md deleted file mode 100644 index ac9ed37b2264778869f92c0910b1cb946fb4427f..0000000000000000000000000000000000000000 --- a/doc/tutorials/gan/index_en.md +++ /dev/null @@ -1,137 +0,0 @@ -# Generative Adversarial Networks (GAN) - -This demo implements GAN training described in the original [GAN paper](https://arxiv.org/abs/1406.2661) and deep convolutional generative adversarial networks [DCGAN paper](https://arxiv.org/abs/1511.06434). - -The high-level structure of GAN is shown in Figure. 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. 
So the generator and the discriminator play a competitive game: the generator tries to generate images that look as real as possible in order to fool the discriminator, while the discriminator tries to distinguish real images from generated ones. - -
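For reference, this two-player game is usually written as the minimax objective from the original GAN paper cited above:

```latex
\min_G \max_D V(D, G) =
  \mathbb{E}_{x \sim p_{\mathrm{data}}(x)}\left[\log D(x)\right] +
  \mathbb{E}_{z \sim p_z(z)}\left[\log\left(1 - D(G(z))\right)\right]
```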
![](./gan.png)
-

- Figure 1. GAN-Model-Structure - figure credit -

- -The generator and discriminator take turn to be trained using SGD. The objective function of the generator is for its generated images being classified as real by the discriminator, and the objective function of the discriminator is to correctly classify real and fake images. When the GAN model is trained to converge to the equilibrium state, the generator will transform the given noise distribution to the distribution of real images, and the discriminator will not be able to distinguish between real and fake images at all. - -## Implementation of GAN Model Structure -Since GAN model involves multiple neural networks, it requires to use paddle python API. So the code walk-through below can also partially serve as an introduction to the usage of Paddle Python API. - -There are three networks defined in gan_conf.py, namely **generator_training**, **discriminator_training** and **generator**. The relationship to the model structure we defined above is that **discriminator_training** is the discriminator, **generator** is the generator, and the **generator_training** combined the generator and discriminator since training generator would require the discriminator to provide loss function. This relationship is described in the following code: -```python -if is_generator_training: - noise = data_layer(name="noise", size=noise_dim) - sample = generator(noise) - -if is_discriminator_training: - sample = data_layer(name="sample", size=sample_dim) - -if is_generator_training or is_discriminator_training: - label = data_layer(name="label", size=1) - prob = discriminator(sample) - cost = cross_entropy(input=prob, label=label) - classification_error_evaluator( - input=prob, label=label, name=mode + '_error') - outputs(cost) - -if is_generator: - noise = data_layer(name="noise", size=noise_dim) - outputs(generator(noise)) -``` - -In order to train the networks defined in gan_conf.py, one first needs to initialize a Paddle environment, parse the config, create GradientMachine from the config and create trainer from GradientMachine as done in the code chunk below: -```python -import py_paddle.swig_paddle as api -# init paddle environment -api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', - '--log_period=100', '--gpu_id=' + args.gpu_id, - '--save_dir=' + "./%s_params/" % data_source) - -# Parse config -gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source) -dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source) -generator_conf = parse_config(conf, "mode=generator,data=" + data_source) - -# Create GradientMachine -dis_training_machine = api.GradientMachine.createFromConfigProto( -dis_conf.model_config) -gen_training_machine = api.GradientMachine.createFromConfigProto( -gen_conf.model_config) -generator_machine = api.GradientMachine.createFromConfigProto( -generator_conf.model_config) - -# Create trainer -dis_trainer = api.Trainer.create(dis_conf, dis_training_machine) -gen_trainer = api.Trainer.create(gen_conf, gen_training_machine) -``` - -In order to balance the strength between generator and discriminator, we schedule to train whichever one is performing worse by comparing their loss function value. The loss function value can be calculated by a forward pass through the GradientMachine. 
-```python -def get_training_loss(training_machine, inputs): - outputs = api.Arguments.createArguments(0) - training_machine.forward(inputs, outputs, api.PASS_TEST) - loss = outputs.getSlotValue(0).copyToNumpyMat() - return numpy.mean(loss) -``` - -After training one network, one needs to sync the new parameters to the other networks. The code below demonstrates one example of such use case: -```python -# Train the gen_training -gen_trainer.trainOneDataBatch(batch_size, data_batch_gen) - -# Copy the parameters from gen_training to dis_training and generator -copy_shared_parameters(gen_training_machine, -dis_training_machine) -copy_shared_parameters(gen_training_machine, generator_machine) -``` - - -## A Toy Example -With the infrastructure explained above, we can now walk you through a toy example of generating two dimensional uniform distribution using 10 dimensional Gaussian noise. - -The Gaussian noises are generated using the code below: -```python -def get_noise(batch_size, noise_dim): - return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32') -``` - -The real samples (2-D uniform) are generated using the code below: -```python -# synthesize 2-D uniform data in gan_trainer.py:114 -def load_uniform_data(): - data = numpy.random.rand(1000000, 2).astype('float32') - return data -``` - -The generator and discriminator network are built using fully-connected layer and batch_norm layer, and are defined in gan_conf.py. - -To train the GAN model, one can use the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu). -```bash -$python gan_trainer.py -d uniform --useGpu 1 -``` -The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution. - -
![](./uniform_sample.png)
-

- Figure 2. Uniform Sample -

- -## MNIST Example -### Data preparation -To download the MNIST data, one can use the following commands: -```bash -$cd data/ -$./get_mnist_data.sh -``` - -### Model description -Following the DC-Gan paper (https://arxiv.org/abs/1511.06434), we use convolution/convolution-transpose layer in the discriminator/generator network to better deal with images. The details of the network structures are defined in gan_conf_image.py. - -### Training the model -To train the GAN model on mnist data, one can use the following command: -```bash -$python gan_trainer.py -d mnist --useGpu 1 -``` -The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. -
![](./mnist_sample.png)
-

- Figure 3. MNIST Sample -

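The sections above describe the balancing schedule (train whichever network currently has the higher loss), but its pieces appear in separate snippets. The loop below is a condensed sketch of how they might fit together; it is not the actual `gan_trainer.py`, and the helpers `prepare_discriminator_data_batch` / `prepare_generator_data_batch` as well as the loop bounds are assumptions for illustration.

```python
# Sketch only: relies on the objects created in the snippets above
# (dis_trainer, gen_trainer, the three GradientMachines, get_training_loss,
# copy_shared_parameters). Hypothetical helpers package noise / real samples
# and labels into api.Arguments batches.
num_passes, num_batches_per_pass, batch_size = 10, 100, 128  # toy schedule sizes

for pass_id in range(num_passes):
    for batch_id in range(num_batches_per_pass):
        data_batch_dis = prepare_discriminator_data_batch(batch_size)  # hypothetical
        data_batch_gen = prepare_generator_data_batch(batch_size)      # hypothetical

        dis_loss = get_training_loss(dis_training_machine, data_batch_dis)
        gen_loss = get_training_loss(gen_training_machine, data_batch_gen)

        if dis_loss > gen_loss:
            # The discriminator is currently doing worse, so give it this batch.
            dis_trainer.trainOneDataBatch(batch_size, data_batch_dis)
            copy_shared_parameters(dis_training_machine, gen_training_machine)
        else:
            gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
            copy_shared_parameters(gen_training_machine, dis_training_machine)
            copy_shared_parameters(gen_training_machine, generator_machine)
```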
diff --git a/doc/tutorials/gan/mnist_sample.png b/doc/tutorials/gan/mnist_sample.png deleted file mode 100644 index f9c7bf7ddd7f148eac4fe347e9c38afaa8876760..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/gan/mnist_sample.png and /dev/null differ diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/tutorials/gan/uniform_sample.png deleted file mode 100644 index e716c48e782019a757bed0cb443f2ed97386cbe2..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/gan/uniform_sample.png and /dev/null differ diff --git a/doc/tutorials/image_classification/cifar.png b/doc/tutorials/image_classification/cifar.png deleted file mode 100644 index f54a0c58837cb3385b32dc57d02cec92666ef0f1..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/cifar.png and /dev/null differ diff --git a/doc/tutorials/image_classification/image_classification.png b/doc/tutorials/image_classification/image_classification.png deleted file mode 100644 index 14f255805081c1b4fab27eaf336fd389fa93ca19..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/image_classification.png and /dev/null differ diff --git a/doc/tutorials/image_classification/index_cn.md b/doc/tutorials/image_classification/index_cn.md deleted file mode 100644 index 87f465522a0fa21c8c03754b4be8dcb035c4de81..0000000000000000000000000000000000000000 --- a/doc/tutorials/image_classification/index_cn.md +++ /dev/null @@ -1,205 +0,0 @@ -图像分类教程 -========== - -在本教程中,我们将使用CIFAR-10数据集训练一个卷积神经网络,并使用这个神经网络来对图片进行分类。如下图所示,卷积神经网络可以辨识图片中的主体,并给出分类结果。 -
![Image Classification](./image_classification.png)
- -## 数据准备 -首先下载CIFAR-10数据集。下面是CIFAR-10数据集的官方网址: - - - -我们准备了一个脚本,可以用于从官方网站上下载CIFAR-10数据集,转为jpeg文件并存入特定的目录。使用这个脚本前请确认已经安装了pillow及相关依赖模块。可以参照下面的命令进行安装: - -1. 安装pillow - -```bash -sudo apt-get install libjpeg-dev -pip install pillow -``` - -2. 下载数据集 - -```bash -cd demo/image_classification/data/ -sh download_cifar.sh -``` - -CIFAR-10数据集包含60000张32x32的彩色图片。图片分为10类,每个类包含6000张。其中50000张图片作为训练集,10000张作为测试集。 - -下图展示了所有的图片类别,每个类别中随机抽取了10张图片。 -
![Image Classification](./cifar.png)
- -脚本运行完成后,我们应当会得到一个名为cifar-out的文件夹,其下子文件夹的结构如下 - - -``` -train ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -test ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -``` - -cifar-out下包含`train`和`test`两个文件夹,其中分别包含了CIFAR-10中的训练集和测试集。这两个文件夹下各自有10个子文件夹,每个子文件夹下存储相应分类的图片。将图片按照上述结构存储好之后,我们就可以着手对分类模型进行训练了。 - -## 预处理 -数据下载之后,还需要进行预处理,将数据转换为Paddle的格式。我们可以通过如下命令进行预处理工作: - -``` -cd demo/image_classification/ -sh preprocess.sh -``` - -其中`preprocess.sh` 调用 `./demo/image_classification/preprocess.py` 对图片进行预处理 -```sh -export PYTHONPATH=$PYTHONPATH:../../ -data_dir=./data/cifar-out -python preprocess.py -i $data_dir -s 32 -c 1 -``` - -`./demo/image_classification/preprocess.py` 使用如下参数: - -- `-i` 或 `--input` 给出输入数据所在路径; -- `-s` 或 `--size` 给出图片尺寸; -- `-c` 或 `--color` 标示图片是彩色图或灰度图 - -## 模型训练 -在开始训练之前,我们需要先创建一个模型配置文件。下面我们给出了一个配置示例。**注意**,这里的列出的和`vgg_16_cifar.py`文件稍有差别,因为该文件可适用于预测。 - -```python -from paddle.trainer_config_helpers import * -data_dir='data/cifar-out/batches/' -meta_path=data_dir+'batches.meta' -args = {'meta':meta_path, 'mean_img_size': 32, - 'img_size': 32, 'num_classes': 10, - 'use_jpeg': 1, 'color': "color"} -define_py_data_sources2(train_list=data_dir+"train.list", - test_list=data_dir+'test.list', - module='image_provider', - obj='processData', - args=args) -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128)) - -img = data_layer(name='image', size=3*32*32) -lbl = data_layer(name="label", size=10) -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -outputs(classification_cost(input=predict, label=lbl)) -``` - -在第一行中我们载入用于定义网络的函数。 -```python -from paddle.trainer_config_helpers import * -``` - -之后定义的`define_py_data_sources2`使用Python数据提供器,其中 `args`将在`image_provider.py`进行使用,该文件负责产生图片数据并传递给Paddle系统 - - `meta`: 训练集平均值。 - - `mean_img_size`: 平均特征图的高度及宽度。 - - `img_size`:输入图片的高度及宽度。 - - `num_classes`:类别个数。 - - `use_jpeg`:处理过程中数据存储格式。 - - `color`:标示是否为彩色图片。 - - `settings`用于设置训练算法。在下面的例子中,learning rate被设置为0.1除以batch size,而weight decay则为0.0005乘以batch size。 - - ```python -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128) -) -``` - -`small_vgg`定义了网络结构。这里我们使用的是一个小的VGG网络。关于VGG卷积神经网络的描述可以参考:[http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/)。 -```python -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -``` -配置创建完毕后,可以运行脚本train.sh来训练模型。 - -```bash -config=vgg_16_cifar.py -output=./cifar_vgg_model -log=train.log - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=1 \ ---save_dir=$output \ -2>&1 | tee $log - -python -m paddle.utils.plotcurve -i $log > plot.png -``` -- 这里我们使用的是GPU模式进行训练。如果你没有GPU环境,可以设置`use_gpu=0`。 -- `./demo/image_classification/vgg_16_cifar.py`是网络和数据配置文件。各项参数的详细说明可以在命令行参数相关文档中找到。 -- 脚本`plotcurve.py`依赖于python的`matplotlib`模块。因此如果这个脚本运行失败,也许是因为需要安装`matplotlib`。 -在训练完成后,训练及测试误差曲线图会被`plotcurve.py`脚本保存在 `plot.png`中。下面是一个误差曲线图的示例: - -
![Training and testing curves.](./plot.png)
- -## 预测 -在训练完成后,模型及参数会被保存在路径`./cifar_vgg_model/pass-%05d`下。例如第300个pass的模型会被保存在`./cifar_vgg_model/pass-00299`。 - -要对一个图片的进行分类预测,我们可以使用`predict.sh`,该脚本将输出预测分类的标签: - -``` -sh predict.sh -``` - -predict.sh: -``` -model=cifar_vgg_model/pass-00299/ -image=data/cifar-out/test/airplane/seaplane_s_000978.png -use_gpu=1 -python prediction.py $model $image $use_gpu -``` - -## 练习 -在CUB-200数据集上使用VGG模型训练一个鸟类图片分类模型。相关的鸟类数据集可以从如下地址下载,其中包含了200种鸟类的照片(主要来自北美洲)。 - - - - - - -## 细节探究 -### 卷积神经网络 -卷积神经网络是一种使用卷积层的前向神经网络,很适合构建用于理解图片内容的模型。一个典型的神经网络如下图所示: - -![Convolutional Neural Network](./lenet.png) - -一个卷积神经网络包含如下层: - -- 卷积层:通过卷积操作从图片或特征图中提取特征 -- 池化层:使用max-pooling对特征图下采样 -- 全连接层:使输入层到隐藏层的神经元是全部连接的。 - -卷积神经网络在图片分类上有着惊人的性能,这是因为它发掘出了图片的两类重要信息:局部关联性质和空间不变性质。通过交替使用卷积和池化处理, 卷积神经网络能够很好的表示这两类信息。 - -关于如何定义网络中的层,以及如何在层之间进行连接,请参考Layer文档。 diff --git a/doc/tutorials/image_classification/index_en.md b/doc/tutorials/image_classification/index_en.md deleted file mode 100644 index 60c81a6a539944634773f38ec4c9a59709dd4afc..0000000000000000000000000000000000000000 --- a/doc/tutorials/image_classification/index_en.md +++ /dev/null @@ -1,221 +0,0 @@ -Image Classification Tutorial -============================== - -This tutorial will guide you through training a convolutional neural network to classify objects using the CIFAR-10 image classification dataset. -As shown in the following figure, the convolutional neural network can recognize the main object in images, and output the classification result. - -
![Image Classification](./image_classification.png)
- -## Data Preparation -First, download CIFAR-10 dataset. CIFAR-10 dataset can be downloaded from its official website. - - - -We have prepared a script to download and process CIFAR-10 dataset. The script will download CIFAR-10 dataset from the official dataset. -It will convert it to jpeg images and organize them into a directory with the required structure for the tutorial. Make sure that you have installed pillow and its dependents. -Consider the following commands: - -1. install pillow dependents - -```bash -sudo apt-get install libjpeg-dev -pip install pillow -``` - -2. download data and preparation - -```bash -cd demo/image_classification/data/ -sh download_cifar.sh -``` - -The CIFAR-10 dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. - -Here are the classes in the dataset, as well as 10 random images from each: -
![Image Classification](./cifar.png)
- - -After downloading and converting, we should find a directory (cifar-out) containing the dataset in the following format: - -``` -train ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -test ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -``` - -It has two directories:`train` and `test`. These two directories contain training data and testing data of CIFAR-10, respectively. Each of these two folders contains 10 sub-folders, ranging from `airplane` to `truck`. Each sub-folder contains images with the corresponding label. After the images are organized into this structure, we are ready to train an image classification model. - -## Preprocess -After the data has been downloaded, it needs to be pre-processed into the Paddle format. We can run the following command for preprocessing. - -``` -cd demo/image_classification/ -sh preprocess.sh -``` - -`preprocess.sh` calls `./demo/image_classification/preprocess.py` to preprocess image data. -```sh -export PYTHONPATH=$PYTHONPATH:../../ -data_dir=./data/cifar-out -python preprocess.py -i $data_dir -s 32 -c 1 -``` - -`./demo/image_classification/preprocess.py` has the following arguments - -- `-i` or `--input` specifes the input data directory. -- `-s` or `--size` specifies the processed size of images. -- `-c` or `--color` specifes whether images are color images or gray images. - - -## Model Training -We need to create a model config file before training the model. An example of the config file (vgg_16_cifar.py) is listed below. **Note**, it is slightly different from the `vgg_16_cifar.py` which also applies to the prediction. - -```python -from paddle.trainer_config_helpers import * -data_dir='data/cifar-out/batches/' -meta_path=data_dir+'batches.meta' -args = {'meta':meta_path, 'mean_img_size': 32, - 'img_size': 32, 'num_classes': 10, - 'use_jpeg': 1, 'color': "color"} -define_py_data_sources2(train_list=data_dir+"train.list", - test_list=data_dir+'test.list', - module='image_provider', - obj='processData', - args=args) -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128)) - -img = data_layer(name='image', size=3*32*32) -lbl = data_layer(name="label", size=10) -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -outputs(classification_cost(input=predict, label=lbl)) -``` - -The first line imports python functions for defining networks. -```python -from paddle.trainer_config_helpers import * -``` - -Then define an `define_py_data_sources2` which use python data provider -interface. The arguments in `args` are used in `image_provider.py` which -yeilds image data and transform them to Paddle. - - `meta`: the mean value of training set. - - `mean_img_size`: the size of mean feature map. - - `img_size`: the height and width of input image. - - `num_classes`: the number of classes. - - `use_jpeg`: the data storage type when preprocessing. - - `color`: specify color image. - -`settings` specifies the training algorithm. In the following example, -it specifies learning rate as 0.1, but divided by batch size, and the weight decay -is 0.0005 and multiplied by batch size. -```python -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128) -) -``` - -The `small_vgg` specifies the network. 
We use a small version of VGG convolutional network as our network -for classification. A description of VGG network can be found here [http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/). -```python -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -``` -After writing the config, we can train the model by running the script train.sh. - -```bash -config=vgg_16_cifar.py -output=./cifar_vgg_model -log=train.log - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=1 \ ---save_dir=$output \ -2>&1 | tee $log - -python -m paddle.utils.plotcurve -i $log > plot.png -``` - -- Here we use GPU mode to train. If you have no gpu environment, just set `use_gpu=0`. - -- `./demo/image_classification/vgg_16_cifar.py` is the network and data configuration file. The meaning of the other flags can be found in the documentation of the command line flags. - -- The script `plotcurve.py` requires the python module of `matplotlib`, so if it fails, maybe you need to install `matplotlib`. - - -After training finishes, the training and testing error curves will be saved to `plot.png` using `plotcurve.py` script. An example of the plot is shown below: - -
![Training and testing curves.](./plot.png)
- - -## Prediction -After we train the model, the model file as well as the model parameters are stored in path `./cifar_vgg_model/pass-%05d`. For example, the model of the 300-th pass is stored at `./cifar_vgg_model/pass-00299`. - -To make a prediction for an image, one can run `predict.sh` as follows. The script will output the label of the classfiication. - -``` -sh predict.sh -``` - -predict.sh: -``` -model=cifar_vgg_model/pass-00299/ -image=data/cifar-out/test/airplane/seaplane_s_000978.png -use_gpu=1 -python prediction.py $model $image $use_gpu -``` - -## Exercise -Train a image classification of birds using VGG model and CUB-200 dataset. The birds dataset can be downloaded here. It contains an image dataset with photos of 200 bird species (mostly North American). - - - - - - -## Delve into Details -### Convolutional Neural Network -A Convolutional Neural Network is a feedforward neural network that uses convolution layers. It is very suitable for building neural networks that process and understand images. A standard convolutional neural network is shown below: - -![Convolutional Neural Network](./lenet.png) - -Convolutional Neural Network contains the following layers: - -- Convolutional layer: It uses convolution operation to extract features from an image or a feature map. -- Pooling layer: It uses max-pooling to downsample feature maps. -- Fully Connected layer: It uses fully connected connections to transform features. - -Convolutional Neural Network achieves amazing performance for image classification because it exploits two important characteristics of images: *local correlation* and *spatial invariance*. By iteratively applying convolution and max-pooing operations, convolutional neural network can well represent these two characteristics of images. - - -For more details of how to define layers and their connections, please refer to the documentation of layers. 
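To make the two properties above (local correlation and spatial invariance) concrete, here is a small NumPy illustration of a single convolution filter followed by 2x2 max-pooling. It is only a toy rendering of the operations discussed in this subsection, not PaddlePaddle API code.

```python
import numpy as np

def conv2d_valid(image, kernel):
    """Naive 'valid' 2-D convolution (cross-correlation) over one channel."""
    h, w = image.shape
    kh, kw = kernel.shape
    out = np.zeros((h - kh + 1, w - kw + 1), dtype=np.float32)
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            # Each output value only looks at a small local neighborhood.
            out[i, j] = np.sum(image[i:i + kh, j:j + kw] * kernel)
    return out

def max_pool2d(feature_map, size=2):
    """Non-overlapping max-pooling: keep the strongest local response."""
    h, w = feature_map.shape
    h, w = h - h % size, w - w % size
    blocks = feature_map[:h, :w].reshape(h // size, size, w // size, size)
    return blocks.max(axis=(1, 3))

image = np.random.rand(8, 8).astype(np.float32)              # toy one-channel image
edge_filter = np.array([[1, 0, -1]] * 3, dtype=np.float32)   # vertical-edge-like filter
feature = conv2d_valid(image, edge_filter)                   # local correlation
pooled = max_pool2d(feature)                                 # tolerance to small shifts
print(feature.shape, pooled.shape)                           # (6, 6) (3, 3)
```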
diff --git a/doc/tutorials/image_classification/lenet.png b/doc/tutorials/image_classification/lenet.png deleted file mode 100644 index 1e6f2b32bad797f3fccb929c72a121fc935b0cbb..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/lenet.png and /dev/null differ diff --git a/doc/tutorials/image_classification/plot.png b/doc/tutorials/image_classification/plot.png deleted file mode 100644 index a31f99791c670e18bb8c62b7604ec8cb0284ffb4..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/plot.png and /dev/null differ diff --git a/doc/tutorials/image_classification/src/cifar.png b/doc/tutorials/image_classification/src/cifar.png deleted file mode 100644 index f54a0c58837cb3385b32dc57d02cec92666ef0f1..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/src/cifar.png and /dev/null differ diff --git a/doc/tutorials/image_classification/src/image_classification.png b/doc/tutorials/image_classification/src/image_classification.png deleted file mode 100644 index 14f255805081c1b4fab27eaf336fd389fa93ca19..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/src/image_classification.png and /dev/null differ diff --git a/doc/tutorials/image_classification/src/lenet.png b/doc/tutorials/image_classification/src/lenet.png deleted file mode 100644 index 1e6f2b32bad797f3fccb929c72a121fc935b0cbb..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/src/lenet.png and /dev/null differ diff --git a/doc/tutorials/image_classification/src/plot.png b/doc/tutorials/image_classification/src/plot.png deleted file mode 100644 index a31f99791c670e18bb8c62b7604ec8cb0284ffb4..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/src/plot.png and /dev/null differ diff --git a/doc/tutorials/imagenet_model/resnet_block.jpg b/doc/tutorials/imagenet_model/resnet_block.jpg deleted file mode 100644 index e16bd3c624030c4c09b358a015b491141b42d8f1..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/imagenet_model/resnet_block.jpg and /dev/null differ diff --git a/doc/tutorials/imagenet_model/resnet_model_cn.md b/doc/tutorials/imagenet_model/resnet_model_cn.md deleted file mode 100644 index 82ec9d70b345c11aba3aa86f8206eedc8072bb88..0000000000000000000000000000000000000000 --- a/doc/tutorials/imagenet_model/resnet_model_cn.md +++ /dev/null @@ -1,284 +0,0 @@ -# Model Zoo - ImageNet # - -[ImageNet](http://www.image-net.org/) 是通用物体分类领域一个众所周知的数据库。本教程提供了一个用于ImageNet上的卷积分类网络模型。 - -## ResNet 介绍 - -论文 [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) 中提出的ResNet网络结构在2015年ImageNet大规模视觉识别竞赛(ILSVRC 2015)的分类任务中赢得了第一名。他们提出残差学习的框架来简化网络的训练,所构建网络结构的的深度比之前使用的网络有大幅度的提高。下图展示的是基于残差的连接方式。左图构造网络模块的方式被用于34层的网络中,而右图的瓶颈连接模块用于50层,101层和152层的网络结构中。 - -
![resnet_block](./resnet_block.jpg)
-
图 1. ResNet 网络模块
- -本教程中我们给出了三个ResNet模型,这些模型都是由原作者提供的模型转换过来的。我们使用PaddlePaddle在ILSVRC的验证集共50,000幅图像上测试了模型的分类错误率,其中输入图像的颜色通道顺序为**BGR**,保持宽高比缩放到短边为256,只截取中心方形的图像区域。分类错误率和模型大小由下表给出。 -
-
-| ResNet | Top-1 | Model Size |
-| :----- | :---- | :--------- |
-| ResNet-50 | 24.9% | 99M |
-| ResNet-101 | 23.7% | 173M |
-| ResNet-152 | 23.2% | 234M |
-
- -## ResNet 模型 - -50层,101层和152层的网络配置文件可参照```demo/model_zoo/resnet/resnet.py```。你也可以通过在命令行参数中增加一个参数如```--config_args=layer_num=50```来指定网络层的数目。 - -### 网络可视化 - -你可以通过执行下面的命令来得到ResNet网络的结构可视化图。该脚本会生成一个dot文件,然后可以转换为图片。需要安装graphviz来转换dot文件为图片。 - -``` -cd demo/model_zoo/resnet -./net_diagram.sh -``` - -### 模型下载 - -``` -cd demo/model_zoo/resnet -./get_model.sh -``` -你可以执行上述命令来下载所有的模型和均值文件,如果下载成功,这些文件将会被保存在```demo/model_zoo/resnet/model```路径下。 - -``` -mean_meta_224 resnet_101 resnet_152 resnet_50 -``` - * resnet_50: 50层网络模型。 - * resnet_101: 101层网络模型。 - * resnet_152: 152层网络模型。 - * mean\_meta\_224: 均值图像文件,图像大小为3 x 224 x 224,颜色通道顺序为**BGR**。你也可以使用这三个值: 103.939, 116.779, 123.68。 - -### 参数信息 - -* **卷积层权重** - - 由于每个卷积层后面连接的是batch normalization层,因此该层中没有偏置(bias)参数,并且只有一个权重。 - 形状: `(Co, ky, kx, Ci)` - * Co: 输出特征图的通道数目 - * ky: 滤波器核在垂直方向上的尺寸 - * kx: 滤波器核在水平方向上的尺寸 - * Ci: 输入特征图的通道数目 - - 二维矩阵: (Co * ky * kx, Ci), 行优先次序存储。 - -* **全连接层权重** - - 二维矩阵: (输入层尺寸, 本层尺寸), 行优先次序存储。 - -* **[Batch Normalization]() 层权重** - -本层有四个参数,实际上只有.w0和.wbias是需要学习的参数,另外两个分别是滑动均值和方差。在测试阶段它们将会被加载到模型中。下表展示了batch normalization层的参数。 -
-
-| 参数名 | 尺寸 | 含义 |
-| :----- | :--- | :--- |
-| `_res2_1_branch1_bn.w0` | 256 | gamma, 缩放参数 |
-| `_res2_1_branch1_bn.w1` | 256 | 特征图均值 |
-| `_res2_1_branch1_bn.w2` | 256 | 特征图方差 |
-| `_res2_1_branch1_bn.wbias` | 256 | beta, 偏置参数 |
-
- -### 参数读取 - -使用者可以使用下面的Python脚本来读取参数值: - -``` -import sys -import numpy as np - -def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - -if __name__=='__main__': - weight = load(sys.argv[1]) -``` - -或者直接使用下面的shell命令: - -``` -od -j 16 -f _res2_1_branch1_bn.w0 -``` - -## 特征提取 - -我们提供了C++和Python接口来提取特征。下面的例子使用了`demo/model_zoo/resnet/example`中的数据,详细地展示了整个特征提取的过程。 - -### C++接口 - -首先,在配置文件中的`define_py_data_sources2`里指定图像数据列表,具体请参照示例`demo/model_zoo/resnet/resnet.py`。 - -``` - train_list = 'train.list' if not is_test else None - # mean.meta is mean file of ImageNet dataset. - # mean.meta size : 3 x 224 x 224. - # If you use three mean value, set like: - # "mean_value:103.939,116.779,123.68;" - args={ - 'mean_meta': "model/mean_meta_224/mean.meta", - 'image_size': 224, 'crop_size': 224, - 'color': True,'swap_channel:': [2, 1, 0]} - define_py_data_sources2(train_list, - 'example/test.list', - module="example.image_list_provider", - obj="processData", - args=args) -``` - -第二步,在`resnet.py`文件中指定要提取特征的网络层的名字。例如, - -``` -Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn") -``` - -第三步,在`extract_fea_c++.sh`文件中指定模型路径和输出的目录,然后执行下面的命令。 - -``` -cd demo/model_zoo/resnet -./extract_fea_c++.sh -``` - -如果执行成功,特征将会存到`fea_output/rank-00000`文件中,如下所示。同时你可以使用`load_feature.py`文件中的`load_feature_c`接口来加载该文件。 - -``` --0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123; --0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769; -``` - -* 每行存储的是一个样本的特征。其中,第一行存的是图像`example/dog.jpg`的特征,第二行存的是图像`example/cat.jpg`的特征。 -* 不同层的特征由分号`;`隔开,并且它们的顺序与`Outputs()`中指定的层顺序一致。这里,左边是`res5_3_branch2c_conv`层的特征,右边是`res5_3_branch2c_bn`层特征。 - -### Python接口 - -示例`demo/model_zoo/resnet/classify.py`中展示了如何使用Python来提取特征。下面的例子同样使用了`./example/test.list`中的数据。执行的命令如下: - -``` -cd demo/model_zoo/resnet -./extract_fea_py.sh -``` - -extract_fea_py.sh: - -``` -python classify.py \ - --job=extract \ - --conf=resnet.py\ - --use_gpu=1 \ - --mean=model/mean_meta_224/mean.meta \ - --model=model/resnet_50 \ - --data=./example/test.list \ - --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \ - --output_dir=features - -``` -* \--job=extract: 指定工作模式来提取特征。 -* \--conf=resnet.py: 网络配置文件。 -* \--use_gpu=1: 指定是否使用GPU。 -* \--model=model/resnet_50: 模型路径。 -* \--data=./example/test.list: 数据列表。 -* \--output_layer="xxx,xxx": 指定提取特征的层。 -* \--output_dir=features: 输出目录。 - -如果运行成功,你将会看到特征存储在`features/batch_0`文件中,该文件是由cPickle产生的。你可以使用`load_feature.py`中的`load_feature_py`接口来打开该文件,它将返回如下的字典: - -``` -{ -'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248 , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)}, -'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)} -} -``` - -仔细观察,这些特征值与上述使用C++接口提取的结果是一致的。 - -## 预测 - -`classify.py`文件也可以用于对样本进行预测。我们提供了一个示例脚本`predict.sh`,它使用50层的ResNet模型来对`example/test.list`中的数据进行预测。 - -``` -cd demo/model_zoo/resnet -./predict.sh -``` - -predict.sh调用了`classify.py`: - -``` -python classify.py \ - --job=predict \ - --conf=resnet.py\ - --multi_crop \ - --model=model/resnet_50 \ - --use_gpu=1 \ - --data=./example/test.list -``` -* \--job=extract: 指定工作模型进行预测。 -* 
\--conf=resnet.py: 网络配置文件。network configure. -* \--multi_crop: 使用10个裁剪图像块,预测概率取平均。 -* \--use_gpu=1: 指定是否使用GPU。 -* \--model=model/resnet_50: 模型路径。 -* \--data=./example/test.list: 数据列表。 - -如果运行成功,你将会看到如下结果,其中156和285是这些图像的分类标签。 - -``` -Label of example/dog.jpg is: 156 -Label of example/cat.jpg is: 282 -``` diff --git a/doc/tutorials/imagenet_model/resnet_model_en.md b/doc/tutorials/imagenet_model/resnet_model_en.md deleted file mode 100644 index 478ad06193b14ba7fe02238df621db1f7b0804d4..0000000000000000000000000000000000000000 --- a/doc/tutorials/imagenet_model/resnet_model_en.md +++ /dev/null @@ -1,284 +0,0 @@ -# Model Zoo - ImageNet # - -[ImageNet](http://www.image-net.org/) is a popular dataset for generic object classification. This tutorial provides convolutional neural network(CNN) models for ImageNet. - -## ResNet Introduction - -ResNets from paper [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) won the 1st place on the ILSVRC 2015 classification task. They present residual learning framework to ease the training of networks that are substantially deeper than those used previously. The residual connections are shown in following figure. The left building block is used in network of 34 layers and the right bottleneck building block is used in network of 50, 101, 152 layers . - -
![resnet_block](./resnet_block.jpg)
-
Figure 1. ResNet Block
- -We present three ResNet models, which are converted from the models provided by the authors . The classfication errors tested in PaddlePaddle on 50,000 ILSVRC validation set with input images channel order of **BGR** by single scale with the shorter side of 256 and single crop as following table. -
-
-| ResNet | Top-1 | Model Size |
-| :----- | :---- | :--------- |
-| ResNet-50 | 24.9% | 99M |
-| ResNet-101 | 23.7% | 173M |
-| ResNet-152 | 23.2% | 234M |
-
- -## ResNet Model - -See ```demo/model_zoo/resnet/resnet.py```. This config contains network of 50, 101 and 152 layers. You can specify layer number by adding argument like ```--config_args=layer_num=50``` in command line arguments. - -### Network Visualization - -You can get a diagram of ResNet network by running the following commands. The script generates dot file and then converts dot file to PNG file, which needs to install graphviz to convert. - -``` -cd demo/model_zoo/resnet -./net_diagram.sh -``` - -### Model Download - -``` -cd demo/model_zoo/resnet -./get_model.sh -``` -You can run above command to download all models and mean file and save them in ```demo/model_zoo/resnet/model``` if downloading successfully. - -``` -mean_meta_224 resnet_101 resnet_152 resnet_50 -``` - * resnet_50: model of 50 layers. - * resnet_101: model of 101 layers. - * resnet_152: model of 152 layers. - * mean\_meta\_224: mean file with 3 x 224 x 224 size in **BGR** order. You also can use three mean values: 103.939, 116.779, 123.68. - -### Parameter Info - -* **Convolution Layer Weight** - - As batch normalization layer is connected after each convolution layer, there is no parameter of bias and only one weight in this layer. - shape: `(Co, ky, kx, Ci)` - * Co: channle number of output feature map. - * ky: filter size in vertical direction. - * kx: filter size in horizontal direction. - * Ci: channle number of input feature map. - - 2-Dim matrix: (Co * ky * kx, Ci), saved in row-major order. - -* **Fully connected Layer Weight** - - 2-Dim matrix: (input layer size, this layer size), saved in row-major order. - -* **[Batch Normalization]() Layer Weight** - -There are four parameters in this layer. In fact, only .w0 and .wbias are the learned parameters. The other two are therunning mean and variance respectively. They will be loaded in testing. Following table shows parameters of a batch normzalization layer. -
-
-| Parameter Name | Number | Meaning |
-| :------------- | :----- | :------ |
-| `_res2_1_branch1_bn.w0` | 256 | gamma, scale parameter |
-| `_res2_1_branch1_bn.w1` | 256 | mean value of feature map |
-| `_res2_1_branch1_bn.w2` | 256 | variance of feature map |
-| `_res2_1_branch1_bn.wbias` | 256 | beta, shift parameter |
-
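As a concrete illustration of the weight layout described in the Parameter Info section above, the snippet below builds a toy flat buffer and views it both as the stored 2-D matrix and as the logical 4-D filter shape. The sizes are invented for illustration; only the row-major (Co * ky * kx, Ci) layout comes from the text.

```python
import numpy as np

# Toy sizes: 64 output channels, 3 x 3 filters, 3 input channels.
Co, ky, kx, Ci = 64, 3, 3, 3

# Pretend this is the flat float32 buffer of one convolution weight
# (real parameter files also start with a 16-byte header, as the load()
# helper in the next subsection shows).
flat = np.arange(Co * ky * kx * Ci, dtype=np.float32)

w2d = flat.reshape(Co * ky * kx, Ci)   # stored form: (Co*ky*kx, Ci), row-major
w4d = w2d.reshape(Co, ky, kx, Ci)      # logical form: (Co, ky, kx, Ci)

# Row (co*ky*kx + y*kx + x) of the 2-D matrix holds the Ci values of
# output filter `co` at spatial offset (y, x).
assert np.array_equal(w2d[0], w4d[0, 0, 0])
```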
- -### Parameter Observation - -Users who want to observe the parameters can use Python to read: - -``` -import sys -import numpy as np - -def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - -if __name__=='__main__': - weight = load(sys.argv[1]) -``` - -or simply use following shell command: - -``` -od -j 16 -f _res2_1_branch1_bn.w0 -``` - -## Feature Extraction - -We provide both C++ and Python interfaces to extract features. The following examples use data in `demo/model_zoo/resnet/example` to show the extracting process in detail. - -### C++ Interface - -First, specify image data list in `define_py_data_sources2` in the config, see example `demo/model_zoo/resnet/resnet.py`. - -``` - train_list = 'train.list' if not is_test else None - # mean.meta is mean file of ImageNet dataset. - # mean.meta size : 3 x 224 x 224. - # If you use three mean value, set like: - # "mean_value:103.939,116.779,123.68;" - args={ - 'mean_meta': "model/mean_meta_224/mean.meta", - 'image_size': 224, 'crop_size': 224, - 'color': True,'swap_channel:': [2, 1, 0]} - define_py_data_sources2(train_list, - 'example/test.list', - module="example.image_list_provider", - obj="processData", - args=args) -``` - -Second, specify layers to extract features in `Outputs()` of `resnet.py`. For example, - -``` -Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn") -``` - -Third, specify model path and output directory in `extract_fea_c++.sh`, and then run the following commands. - -``` -cd demo/model_zoo/resnet -./extract_fea_c++.sh -``` - -If successful, features are saved in `fea_output/rank-00000` as follows. And you can use `load_feature_c` interface in `load_feature.py ` to load such a file. - -``` --0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123; --0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769; -``` - -* Each line stores features of a sample. Here, the first line stores features of `example/dog.jpg` and second line stores features of `example/cat.jpg`. -* Features of different layers are splitted by `;`, and their order is consistent with the layer order in `Outputs()`. Here, the left features are `res5_3_branch2c_conv` layer and right features are `res5_3_branch2c_bn` layer. - -### Python Interface - -`demo/model_zoo/resnet/classify.py` is an example to show how to use Python to extract features. Following example still uses data of `./example/test.list`. Command is as follows: - -``` -cd demo/model_zoo/resnet -./extract_fea_py.sh -``` - -extract_fea_py.sh: - -``` -python classify.py \ - --job=extract \ - --conf=resnet.py\ - --use_gpu=1 \ - --mean=model/mean_meta_224/mean.meta \ - --model=model/resnet_50 \ - --data=./example/test.list \ - --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \ - --output_dir=features - -``` -* \--job=extract: specify job mode to extract feature. -* \--conf=resnet.py: network configure. -* \--use_gpu=1: speficy GPU mode. -* \--model=model/resnet_5: model path. -* \--data=./example/test.list: data list. -* \--output_layer="xxx,xxx": specify layers to extract features. -* \--output_dir=features: output diretcoty. - -If run successfully, you will see features saved in `features/batch_0`, this file is produced with cPickle. 
You can use `load_feature_py` interface in `load_feature.py` to open the file, and it returns a dictionary as follows: - -``` -{ -'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248 , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)}, -'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)} -} -``` - -Observed carefully, these feature values are consistent with the above results extracted by C++ interface. - -## Prediction - -`classify.py` also can be used to predict. We provide an example script `predict.sh` to predict data in `example/test.list` using a ResNet model with 50 layers. - -``` -cd demo/model_zoo/resnet -./predict.sh -``` - -predict.sh calls the `classify.py`: - -``` -python classify.py \ - --job=predict \ - --conf=resnet.py\ - --multi_crop \ - --model=model/resnet_50 \ - --use_gpu=1 \ - --data=./example/test.list -``` -* \--job=extract: speficy job mode to predict. -* \--conf=resnet.py: network configure. -* \--multi_crop: use 10 crops and average predicting probability. -* \--use_gpu=1: speficy GPU mode. -* \--model=model/resnet_50: model path. -* \--data=./example/test.list: data list. - -If run successfully, you will see following results, where 156 and 285 are labels of the images. - -``` -Label of example/dog.jpg is: 156 -Label of example/cat.jpg is: 282 -``` diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md deleted file mode 100644 index 6a27004d58d24cc466d930322be8cdbb2f434c74..0000000000000000000000000000000000000000 --- a/doc/tutorials/index_cn.md +++ /dev/null @@ -1,13 +0,0 @@ -# 完整教程 - -* [快速入门](quick_start/index_cn.rst) -* [个性化推荐](rec/ml_regression_cn.rst) -* [图像分类](image_classification/index_cn.md) -* [情感分析](sentiment_analysis/index_cn.md) -* [语义角色标注](semantic_role_labeling/index_cn.md) -* [机器翻译](text_generation/index_cn.md) - -## 常用模型 - -* [ResNet模型](imagenet_model/resnet_model_cn.md) -* [词向量模型](embedding_model/index_cn.md) diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md deleted file mode 100644 index 77331a703b6f0fdf92921ebcc476325b7327e976..0000000000000000000000000000000000000000 --- a/doc/tutorials/index_en.md +++ /dev/null @@ -1,14 +0,0 @@ -# TUTORIALS -There are several examples and demos here. 
- -* [Quick Start](quick_start/index_en.md) -* [MovieLens Regression](rec/ml_regression_en.rst) -* [Image Classification](image_classification/index_en.md) -* [Sentiment Analysis](sentiment_analysis/index_en.md) -* [Semantic Role Labeling](semantic_role_labeling/index_en.md) -* [Text Generation](text_generation/index_en.md) -* [Image Auto-Generation](gan/index_en.md) - -## Model Zoo -* [ImageNet: ResNet](imagenet_model/resnet_model_en.md) -* [Embedding: Chinese Word](embedding_model/index_en.md) diff --git a/doc/tutorials/quick_start/index_cn.rst b/doc/tutorials/quick_start/index_cn.rst deleted file mode 100644 index d565fcf95ef8489eb22a5a1b5a552b5336f4e371..0000000000000000000000000000000000000000 --- a/doc/tutorials/quick_start/index_cn.rst +++ /dev/null @@ -1,397 +0,0 @@ -============= -快速入门教程 -============= - -我们将以 `文本分类问题 `_ 为例, -介绍PaddlePaddle的基本使用方法。 - -安装 -==== - -请参考 :ref:`install_steps` 安装PaddlePaddle。 - -使用概述 -======== - -**文本分类问题**:对于给定的一条文本,我们从提前给定的类别集合中选择其所属类别。 - -比如, 在购物网站上,通过查看买家对某个产品的评价反馈, 评估该产品的质量。 - -- 这个显示器很棒! (好评) -- 用了两个月之后这个显示器屏幕碎了。(差评) - -使用PaddlePaddle, 每一个任务流程都可以被划分为如下五个步骤。 - - .. image:: src/Pipeline_cn.jpg - :align: center - :scale: 80% - -1. 数据格式准备 - - 本例每行保存一条样本,类别Id和文本信息用 ``Tab`` 间隔,文本中的单词用空格分隔(如果不切词,则字与字之间用空格分隔),例如:``类别Id '\t' 这 个 显 示 器 很 棒 !`` -2. 向系统传送数据 - - PaddlePaddle可以执行用户的python脚本程序来读取各种格式的数据文件。 - - 本例的所有字符都将转换为连续整数表示的Id传给模型。 -3. 描述网络结构和优化算法 - - 本例由易到难展示4种不同的文本分类网络配置:逻辑回归模型,词向量模型,卷积模型,时序模型。 - - 常用优化算法包括Momentum, RMSProp,AdaDelta,AdaGrad,Adam,Adamax等,本例采用Adam优化方法,加了L2正则和梯度截断。 -4. 训练模型 -5. 应用模型 - -数据格式准备 ------------- - -接下来我们将展示如何用PaddlePaddle训练一个文本分类模型,将 `Amazon电子产品评论数据 `_ 分为好评(正样本)和差评(负样本)两种类别。 -`源代码 `_ 的 ``demo/quick_start`` 目录里提供了该数据的下载脚本和预处理脚本,你只需要在命令行输入以下命令,就能够很方便的完成数据下载和相应的预处理工作。 - -.. code-block:: bash - - cd demo/quick_start - ./data/get_data.sh - ./preprocess.sh - -数据预处理完成之后,通过配置类似于 ``dataprovider_*.py`` 的数据读取脚本和类似于 ``trainer_config.*.py`` 的训练模型脚本,PaddlePaddle将以设置参数的方式来设置 -相应的数据读取脚本和训练模型脚本。接下来,我们将对这两个步骤给出了详细的解释,你也可以先跳过本文的解释环节,直接进入训练模型章节, 使用 ``sh train.sh`` 开始训练模型, -查看`train.sh`内容,通过 **自底向上法** (bottom-up approach)来帮助你理解PaddlePaddle的内部运行机制。 - - -向系统传送数据 -============== - -Python脚本读取数据 ------------------- - -`DataProvider` 是PaddlePaddle负责提供数据的模块,主要职责在于将训练数据传入内存或者显存,让模型能够得到训练更新,其包括两个函数: - -* initializer:PaddlePaddle会在调用读取数据的Python脚本之前,先调用initializer函数。在下面例子里,我们在initialzier函数里初始化词表,并且在随后的读取数据过程中填充词表。 -* process:PaddlePaddle调用process函数来读取数据。每次读取一条数据后,process函数会用yield语句输出这条数据,从而能够被PaddlePaddle 捕获 (harvest)。 - -``dataprovider_bow.py`` 文件给出了完整例子: - -.. literalinclude:: ../../../demo/quick_start/dataprovider_bow.py - :language: python - :lines: 21-70 - :linenos: - :emphasize-lines: 8,33 - -详细内容请参见 :ref:`api_dataprovider` 。 - -配置中的数据加载定义 --------------------- - -在模型配置中通过 ``define_py_data_sources2`` 接口来加载数据: - -.. literalinclude:: ../../../demo/quick_start/trainer_config.emb.py - :language: python - :lines: 19-35 - :linenos: - :emphasize-lines: 12 - - -以下是对上述数据加载的解释: - -- data/train.list,data/test.list: 指定训练数据和测试数据 -- module="dataprovider_bow": 处理数据的Python脚本文件 -- obj="process": 指定生成数据的函数 -- args={"dictionary": word_dict}: 额外的参数,这里指定词典 - -更详细数据格式和用例请参考 :ref:`api_pydataprovider2` 。 - -模型网络结构 -============ - -本小节我们将介绍模型网络结构。 - - .. image:: src/PipelineNetwork_cn.jpg - :align: center - :scale: 80% - - -我们将以最基本的逻辑回归网络作为起点,并逐渐展示更加深入的功能。更详细的网络配置连接请参考 :ref:`api_trainer_config_helpers_layers` 。 -所有配置都能在 `源代码 `_ 的 ``demo/quick_start`` 目录下找到。 - -逻辑回归模型 ------------- - -具体流程如下: - - .. 
image:: src/NetLR_cn.jpg - :align: center - :scale: 80% - -- 获取利用 `one-hot vector `_ 表示的每个单词,维度是词典大小 - - .. code-block:: python - - word = data_layer(name="word", size=word_dim) - -- 获取该条样本类别Id,维度是类别个数。 - - .. code-block:: python - - label = data_layer(name="label", size=label_dim) - -- 利用逻辑回归模型对该向量进行分类,同时会计算分类准确率 - - .. code-block:: python - - # Define a fully connected layer with logistic activation (also called softmax activation). - output = fc_layer(input=word, - size=label_dim, - act_type=SoftmaxActivation()) - # Define cross-entropy classification loss and error. - classification_cost(input=output, label=label) - - - - input: 除去data层,每个层都有一个或多个input,多个input以list方式输入 - - size: 该层神经元个数 - - act_type: 激活函数类型 - -**效果总结**:我们将在后面介绍训练和预测流程的脚本。在此为方便对比不同网络结构,我们总结了各个网络的复杂度和效果。 - - ===================== =============================== ================= - 网络名称 参数数量 错误率 - ===================== =============================== ================= - 逻辑回归 252 KB 8.652 % - ===================== =============================== ================= - -词向量模型 ----------- - -embedding模型需要稍微改变提供数据的Python脚本,即 ``dataprovider_emb.py``,词向量模型、 -卷积模型、时序模型均使用该脚本。其中文本输入类型定义为整数时序类型integer_value_sequence。 - -.. code-block:: python - - def initializer(settings, dictionary, **kwargs): - settings.word_dict = dictionary - settings.input_types = [ - # Define the type of the first input as sequence of integer. - # The value of the integers range from 0 to len(dictrionary)-1 - integer_value_sequence(len(dictionary)), - # Define the second input for label id - integer_value(2)] - - @provider(init_hook=initializer) - def process(settings, file_name): - ... - # omitted, it is same as the data provider for LR model - -该模型依然使用逻辑回归分类网络的框架, 只是将句子用连续向量表示替换为用稀疏向量表示, 即对第三步进行替换。句子表示的计算更新为两步: - -.. image:: src/NetContinuous_cn.jpg - :align: center - :scale: 80% - -- 利用单词Id查找该单词对应的连续向量(维度为word_dim), 输入N个单词,输出为N个word_dim维度向量 - - .. code-block:: python - - emb = embedding_layer(input=word, size=word_dim) - -- 将该句话包含的所有单词向量求平均, 得到句子的表示 - - .. code-block:: python - - avg = pooling_layer(input=emb, pooling_type=AvgPooling()) - -其它部分和逻辑回归网络结构一致。 - -**效果总结:** - - ===================== =============================== ================== - 网络名称 参数数量 错误率 - ===================== =============================== ================== - 词向量模型 15 MB 8.484 % - ===================== =============================== ================== - -卷积模型 ------------ - -卷积网络是一种特殊的从词向量表示到句子表示的方法, 也就是将词向量模型进一步演化为三个新步骤。 - -.. image:: src/NetConv_cn.jpg - :align: center - :scale: 80% - -文本卷积分可为三个步骤: - -1. 首先,从每个单词左右两端分别获取k个相邻的单词, 拼接成一个新的向量; - -2. 其次,对该向量进行非线性变换(例如Sigmoid变换), 使其转变为维度为hidden_dim的新向量; - -3. 最后,对整个新向量集合的每一个维度取最大值来表示最后的句子。 - -这三个步骤可配置为: - -.. code-block:: python - - text_conv = sequence_conv_pool(input=emb, - context_start=k, - context_len=2 * k + 1) - -**效果总结:** - - ===================== =============================== ======================== - 网络名称 参数数量 错误率 - ===================== =============================== ======================== - 卷积模型 16 MB 5.628 % - ===================== =============================== ======================== - -时序模型 ----------- - -.. image:: src/NetRNN_cn.jpg - :align: center - :scale: 80% - -时序模型,也称为RNN模型, 包括简单的 `RNN模型 `_, `GRU模型 `_ 和 `LSTM模型 `_ 等等。 - -- GRU模型配置: - - .. code-block:: python - - gru = simple_gru(input=emb, size=gru_size) - - -- LSTM模型配置: - - .. 
code-block:: python - - lstm = simple_lstm(input=emb, size=lstm_size) - -本次试验,我们采用单层LSTM模型,并使用了Dropout,**效果总结:** - - ===================== =============================== ========================= - 网络名称 参数数量 错误率 - ===================== =============================== ========================= - 时序模型 16 MB 4.812 % - ===================== =============================== ========================= - -优化算法 -========= - -`优化算法 `_ 包括 -Momentum, RMSProp,AdaDelta,AdaGrad,ADAM,Adamax等,这里采用Adam优化方法,同时使用了L2正则(L2 Regularization)和梯度截断(Gradient Clipping)。 - -.. code-block:: python - - settings(batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -训练模型 -========= - -在数据加载和网络配置完成之后, 我们就可以训练模型了。 - -.. image:: src/PipelineTrain_cn.jpg - :align: center - :scale: 80% - -训练模型,我们只需要运行 ``train.sh`` 训练脚本: - - .. code-block:: bash - - ./train.sh - -``train.sh`` 中包含了训练模型的基本命令。训练时所需设置的主要参数如下: - - .. code-block:: bash - - paddle train \ - --config=trainer_config.py \ - --log_period=20 \ - --save_dir=./output \ - --num_passes=15 \ - --use_gpu=false - -这里只简单介绍了单机训练,如何进行分布式训练,请参考 :ref:`cluster_train` 。 - -预测 -===== - -当模型训练好了之后,我们就可以进行预测了。 - -.. image:: src/PipelineTest_cn.jpg - :align: center - :scale: 80% - -之前配置文件中 ``test.list`` 指定的数据将会被测试,这里直接通过预测脚本 ``predict.sh`` 进行预测, -更详细的说明,请参考 :ref:`api_swig_py_paddle` 。 - - .. code-block:: bash - - model="output/pass-00003" - paddle train \ - --config=trainer_config.lstm.py \ - --use_gpu=false \ - --job=test \ - --init_model_path=$model \ - --config_args=is_predict=1 \ - --predict_output_dir=. \ - - mv rank-00000 result.txt - -这里以 ``output/pass-00003`` 为例进行预测,用户可以根据训练日志,选择测试结果最好的模型来预测。 - -预测结果以文本的形式保存在 ``result.txt`` 中,一行为一个样本,格式如下: - - .. code-block:: bash - - 预测ID;ID为0的概率 ID为1的概率 - 预测ID;ID为0的概率 ID为1的概率 - -总体效果总结 -============== - -在 ``/demo/quick_start`` 目录下,能够找到这里使用的所有数据, 网络配置, 训练脚本等等。 -对于Amazon-Elec测试集(25k), 如下表格,展示了上述网络模型的训练效果: - - ===================== =============================== ============= ================================== - 网络名称 参数数量 错误率 配置文件 - ===================== =============================== ============= ================================== - 逻辑回归模型 252 KB 8.652% trainer_config.lr.py - 词向量模型 15 MB 8.484% trainer_config.emb.py - 卷积模型 16 MB 5.628% trainer_config.cnn.py - 时序模型 16 MB 4.812% trainer_config.lstm.py - ===================== =============================== ============= ================================== - - -附录 -===== - -命令行参数 ----------- - -* \--config:网络配置 -* \--save_dir:模型存储路径 -* \--log_period:每隔多少batch打印一次日志 -* \--num_passes:训练轮次,一个pass表示过一遍所有训练样本 -* \--config_args:命令指定的参数会传入网络配置中。 -* \--init_model_path:指定初始化模型路径,可用在测试或训练时指定初始化模型。 - -默认一个pass保存一次模型,也可以通过saving_period_by_batches设置每隔多少batch保存一次模型。 -可以通过show_parameter_stats_period设置打印参数信息等。 -其他参数请参考 命令行参数文档(链接待补充)。 - -输出日志 ---------- - -.. 
code-block:: bash - - TrainerInternal.cpp:160] Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297 CurrentEval: classification_error_evaluator=0.304297 - -模型训练会看到类似上面这样的日志信息,详细的参数解释,请参考如下表格: - - =========================================== ============================================================== - 名称 解释 - =========================================== ============================================================== - Batch=20 表示过了20个batch - samples=2560 表示过了2560个样本 - AvgCost 每个pass的第0个batch到当前batch所有样本的平均cost - CurrentCost 当前log_period个batch所有样本的平均cost - Eval: classification_error_evaluator 每个pass的第0个batch到当前batch所有样本的平均分类错误率 - CurrentEval: classification_error_evaluator 当前log_period个batch所有样本的平均分类错误率 - =========================================== ============================================================== diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md deleted file mode 100644 index ca110431cf921ae0480d3fb2b17c58f90a84cc0e..0000000000000000000000000000000000000000 --- a/doc/tutorials/quick_start/index_en.md +++ /dev/null @@ -1,562 +0,0 @@ -# Quick Start - -This tutorial will teach the basics of deep learning (DL), including how to implement many different models in PaddlePaddle. You will learn how to: - - Prepare data into the standardized format that PaddlePaddle accepts. - - Write data providers that read data into PaddlePaddle. - - Configure neural networks in PaddlePaddle layer by layer. - - Train models. - - Perform inference with trained models. - - -## Install - -To get started, please install PaddlePaddle on your computer. Throughout this tutorial, you will learn by implementing different DL models for text classification. - -To install PaddlePaddle, please follow the instructions here: Build and Install. - -## Overview -For the first step, you will use PaddlePaddle to build a **text classification** system. For example, suppose you run an e-commence website, and you want to analyze the sentiment of user reviews to evaluate product quality. - -For example, given the input - -``` -This monitor is fantastic. -``` - -Your classifier should output “positive”, since this text snippet shows that the user is satisfied with the product. Given this input: - -``` -The monitor breaks down two months after purchase. -``` - -the classifier should output “negative“. - -To build your text classification system, your code will need to perform five steps: -
![](./src/Pipeline_en.jpg)
- - - Preprocess data into a standardized format. - - Provide data to the learning model. - - Specify the neural network structure. - - Train the model. - - Inference (make prediction on test examples). - - -1. Preprocess data into standardized format - - In the text classification example, you will start with a text file with one training example per line. Each line contains category id (in machine learning, often denoted the target y), followed by the input text (often denoted x); these two elements are separated by a Tab. For example: ```positive [tab] This monitor is fantastic```. You will preprocess this raw data into a format that Paddle can use. - -2. Provide data to the learning model. - - You can write data providers in Python. For any required data preprocessing step, you can add the preprocessing code to the PyDataProvider Python file. - - In our text classification example, every word or character will be converted into an integer id, specified in a dictionary file. It perform a dictionary lookup in PyDataProvider to get the id. -3. Specify neural network structure. (From easy to hard, we provide 4 kinds of network configurations) - - A logistic regression model. - - A word embedding model. - - A convolutional neural network model. - - A sequential recurrent neural network model. - - You will also learn different learning algorithms. -4. Training model. -5. Inference. - -## Preprocess data into standardized format -In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product. - -`demo/quick_start` in the [source code](https://github.com/PaddlePaddle/Paddle) provides script for downloading the preprocessed data as shown below. (If you want to process the raw data, you can use the script `demo/quick_start/data/proc_from_raw_data/get_data.sh`). - -```bash -cd demo/quick_start -./data/get_data.sh -``` - -## Transfer Data to Model -### Write Data Provider with Python -The following `dataprovider_bow.py` gives a complete example of writing data provider with Python. It includes the following parts: - -* initalizer: define the additional meta-data of the data provider and the types of the input data. -* process: Each `yield` returns a data sample. In this case, it return the text representation and category id. The order of features in the returned result needs to be consistent with the definition of the input types in `initalizer`. - -```python -from paddle.trainer.PyDataProvider2 import * - -# id of the word not in dictionary -UNK_IDX = 0 - -# initializer is called by the framework during initialization. -# It allows the user to describe the data types and setup the -# necessary data structure for later use. -# `settings` is an object. initializer need to properly fill settings.input_types. -# initializer can also store other data structures needed to be used at process(). -# In this example, dictionary is stored in settings. -# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py -def initializer(settings, dictionary, **kwargs): - # Put the word dictionary into settings - settings.word_dict = dictionary - - # setting.input_types specifies what the data types the data provider - # generates. 
- settings.input_types = [ - # The first input is a sparse_binary_vector, - # which means each dimension of the vector is either 0 or 1. It is the - # bag-of-words (BOW) representation of the texts. - sparse_binary_vector(len(dictionary)), - # The second input is an integer. It represents the category id of the - # sample. 2 means there are two labels in the dataset. - # (1 for positive and 0 for negative) - integer_value(2)] - -# Delaring a data provider. It has an initializer 'data_initialzer'. -# It will cache the generated data of the first pass in memory, so that -# during later pass, no on-the-fly data generation will be needed. -# `setting` is the same object used by initializer() -# `file_name` is the name of a file listed train_list or test_list file given -# to define_py_data_sources2(). See trainer_config.lr.py. -@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - # Open the input data file. - with open(file_name, 'r') as f: - # Read each line. - for line in f: - # Each line contains the label and text of the comment, separated by \t. - label, comment = line.strip().split('\t') - - # Split the words into a list. - words = comment.split() - - # convert the words into a list of ids by looking them up in word_dict. - word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words] - - # Return the features for the current comment. The first is a list - # of ids representing a 0-1 binary sparse vector of the text, - # the second is the integer id of the label. - yield word_vector, int(label) -``` - -### Define Python Data Provider in Configuration files. -You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies: - -- The path of the training and testing data (`data/train.list`, `data/test.list`). -- The location of the data provider file (`dataprovider_bow`). -- The function to call to get data. (`process`). -- Additional arguments or data. Here it passes the path of word dictionary. - -```python -from paddle.trainer_config_helpers import * - -file = "data/dict.txt" -word_dict = dict() -with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i -# define the data sources for the model. -# We need to use different process for training and prediction. -# For training, the input data includes both word IDs and labels. -# For prediction, the input data only includs word Ids. -define_py_data_sources2(train_list='data/train.list', - test_list='data/test.list', - module="dataprovider_bow", - obj="process", - args={"dictionary": word_dict}) -``` -You can refer to the following link for more detailed examples and data formats: PyDataProvider2. - -## Network Architecture -We will describe four kinds of network architectures in this section. -
![](./src/PipelineNetwork_en.jpg)
- -First, you will build a logistic regression model. Later, you will also get a chance to build other, more powerful network architectures. -For more detailed documentation, you can refer to the layer documentation. All configuration files are in the `demo/quick_start` directory. - -### Logistic Regression -The architecture is illustrated in the following picture: -
![](./src/NetLR_en.png)
- -- You need to define the data for text features. The size of the data layer is the number of words in the dictionary. - -```python -word = data_layer(name="word", size=voc_dim) -``` - -- You also need to define the category id for each example. The size of the data layer is the number of labels. - -```python -label = data_layer(name="label", size=label_dim) -``` - -- It uses a logistic regression model to classify the vector, and it will output the classification error during training. - - Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case. - - *size* for each layer means the number of neurons of the layer. - - *act_type* specifies the activation function applied to the output of each neuron independently. - - Some layers can have additional special inputs. For example, `classification_cost` needs the ground-truth label as input to compute the classification loss and error. -```python -# Define a fully connected layer with softmax activation (the multi-class form of logistic activation). -output = fc_layer(input=word, - size=label_dim, - act_type=SoftmaxActivation()) -# Define cross-entropy classification loss and error. -classification_cost(input=output, label=label) -``` - -Performance summary: You can refer to the training and testing scripts later. In order to compare different network architectures, the model complexity and test classification error are listed in the following table: - -
| Network name | Number of parameters | Test error |
| --- | --- | --- |
| Logistic regression | 252 KB | 8.652% |
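To make the computation above concrete, the following plain-Python sketch (using numpy rather than PaddlePaddle) mirrors what the logistic regression network computes: a bag-of-words vector passed through one fully connected layer and a softmax. The dictionary, weights, and input sentence here are made up purely for illustration.

```python
import numpy as np

# A toy dictionary and a toy comment (made-up values, for illustration only).
word_dict = {"this": 0, "monitor": 1, "is": 2, "fantastic": 3, "terrible": 4}
voc_dim, label_dim = len(word_dict), 2
words = "this monitor is fantastic".split()

# Bag-of-words representation: a 0/1 vector over the dictionary,
# i.e. what sparse_binary_vector(len(dictionary)) describes.
x = np.zeros(voc_dim)
for w in words:
    x[word_dict[w]] = 1.0

# One fully connected layer followed by softmax, as in the fc_layer above.
# In the real model, W and b are learned during training.
W = np.random.randn(voc_dim, label_dim) * 0.1
b = np.zeros(label_dim)
logits = x @ W + b
probs = np.exp(logits) / np.exp(logits).sum()  # softmax
print(probs)  # [probability of label 0 (negative), probability of label 1 (positive)]
```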
- -### Word Embedding Model -In order to use the word embedding model, you need to change the data provider a little bit so that the input words are given as a sequence of word IDs. The revised data provider `dataprovider_emb.py` is listed below. You only need to change initializer() for the type of the first input: it is changed from sparse_binary_vector to a sequence of integers. process() remains the same. This data provider can also be used for the later sequence models. - -```python -def initializer(settings, dictionary, **kwargs): - # Put the word dictionary into settings - settings.word_dict = dictionary - settings.input_types = [ - # Define the type of the first input as a sequence of integers. - integer_value_sequence(len(dictionary)), - # Define the second input for the label id - integer_value(2)] - -@provider(init_hook=initializer) -def process(settings, file_name): - ... - # omitted; it is the same as the data provider for the LR model -``` - -This model is very similar to the logistic regression framework, but it uses word embedding vectors instead of sparse vectors to represent words. -
![](./src/NetContinuous_en.png)
- -- It looks up the dense word embedding vector for each word in the dictionary (the embedding dimension is `word_dim`). The input is a sequence of N words; the output is N `word_dim`-dimensional vectors. - -```python -emb = embedding_layer(input=word, dim=word_dim) -``` - -- It averages all the word embeddings in a sentence to get the sentence representation. - -```python -avg = pooling_layer(input=emb, pooling_type=AvgPooling()) -``` - -The other parts of the model are the same as in the logistic regression network. - -The performance is summarized in the following table: - -
| Network name | Number of parameters | Test error |
| --- | --- | --- |
| Word embedding model | 15 MB | 8.484% |
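The embedding-and-average step above can also be illustrated outside of PaddlePaddle. The numpy sketch below looks up one row of an embedding table per word id and averages the rows, which is the computation described by `embedding_layer` followed by `pooling_layer` with `AvgPooling`; the table sizes and ids are made-up example values.

```python
import numpy as np

word_dim = 4                    # embedding dimension (word_dim above)
vocab_size = 1000               # number of words in the dictionary
emb_table = np.random.randn(vocab_size, word_dim) * 0.1   # learned embedding table

word_ids = [12, 7, 256, 3]      # a sentence as a sequence of word ids

# embedding_layer: one word_dim-dimensional vector per word.
emb = emb_table[word_ids]       # shape (4, word_dim)

# pooling_layer with AvgPooling: average over the time dimension
# to get a single sentence representation.
avg = emb.mean(axis=0)          # shape (word_dim,)
print(avg)
```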
- -### Convolutional Neural Network Model -A convolutional neural network converts a sequence of word embeddings into a sentence representation using temporal convolutions. You will transform the fully connected layer of the word embedding model into 3 new sub-steps. -
![](./src/NetConv_en.png)
- - -Text convolution has 3 steps: -1. Get the K-nearest-neighbor context of each word in a sentence and stack them into a 2D representation. -2. Apply temporal convolution to this representation to produce a new `hidden_dim`-dimensional vector for each position. -3. Apply max-pooling to the new vectors at all the time steps in a sentence to get the sentence representation. - -```python -# context_len means the convolution kernel size. -# context_start means the start of the convolution. It can be negative. In that case, zero padding is applied. -text_conv = sequence_conv_pool(input=emb, - context_start=k, - context_len=2 * k + 1) -``` - -The performance is summarized in the following table: - -
| Network name | Number of parameters | Test error |
| --- | --- | --- |
| Convolutional model | 16 MB | 5.628% |
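The three text-convolution steps listed above can be sketched in plain numpy as a conceptual illustration of what `sequence_conv_pool` computes. All sizes and values below are made up, zero padding is used at the sentence boundaries, and a ReLU stands in for whatever activation the real layer uses.

```python
import numpy as np

word_dim, hidden_dim, k = 4, 8, 1          # k neighbors on each side, window size 2k + 1
emb = np.random.randn(6, word_dim)         # a sentence of 6 word embeddings (toy values)

# Step 1: stack each word with its k left and k right neighbors (zero padding at the edges).
padded = np.vstack([np.zeros((k, word_dim)), emb, np.zeros((k, word_dim))])
windows = np.array([padded[t:t + 2 * k + 1].ravel() for t in range(len(emb))])

# Step 2: temporal convolution, i.e. the same linear projection applied at every position.
W = np.random.randn((2 * k + 1) * word_dim, hidden_dim) * 0.1
conv = np.maximum(windows @ W, 0)          # one hidden_dim vector per time step

# Step 3: max-pooling over all time steps gives the sentence representation.
sentence_vec = conv.max(axis=0)            # shape (hidden_dim,)
print(sentence_vec.shape)
```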
- -### Recurrent Model -
![](./src/NetRNN_en.png)
- -You can use a recurrent neural network as the time sequence model, including the simple RNN, GRU, and LSTM models. - -- A GRU model can be specified via: - -```python -gru = simple_gru(input=emb, size=gru_size) -``` - -- An LSTM model can be specified via: - -```python -lstm = simple_lstm(input=emb, size=lstm_size) -``` - -You can use a single-layer LSTM model with dropout for this text classification problem. The performance is summarized in the following table: - -
| Network name | Number of parameters | Test error |
| --- | --- | --- |
| Recurrent model | 16 MB | 4.812% |
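The GRU and LSTM layers above are gated recurrent cells. As a rough illustration of what any time-sequence model does (carry a hidden state across the words of a sentence), here is a minimal vanilla recurrent cell in numpy; it is much simpler than the GRU/LSTM actually used in `trainer_config.lstm.py`, and every value in it is made up.

```python
import numpy as np

word_dim, hidden_dim = 4, 8
emb = np.random.randn(5, word_dim)          # 5 word embeddings (toy values)

# Parameters of a plain (non-gated) recurrent cell; GRU/LSTM add gates on top of this idea.
W_x = np.random.randn(word_dim, hidden_dim) * 0.1
W_h = np.random.randn(hidden_dim, hidden_dim) * 0.1
b = np.zeros(hidden_dim)

h = np.zeros(hidden_dim)
for x_t in emb:                             # process the sequence one step at a time
    h = np.tanh(x_t @ W_x + h @ W_h + b)    # the hidden state carries context forward

print(h)  # the final state summarizes the sentence and can feed the softmax classifier
```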
- -## Optimization Algorithm -Optimization algorithms include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use the Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been shown to work very well for training recurrent neural networks. - -```python -settings(batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) -``` - -## Training Model -After completing data preparation and network architecture specification, you will run the training script. -
![](./src/PipelineTrain_en.png)
- -Training script: our training script is the `train.sh` file. The training arguments are listed below: - -```bash -paddle train \ ---config=trainer_config.py \ ---log_period=20 \ ---save_dir=./output \ ---num_passes=15 \ ---use_gpu=false -``` - -We do not provide examples of how to train on clusters here. If you want to train on clusters, please follow the distributed training documentation or other demos for more details. - -## Inference -You can use the trained model to perform prediction on a dataset with no labels. You can also evaluate the model on a dataset with labels to obtain its test accuracy. -
![](./src/PipelineTest_en.png)
- -The test script is listed below. PaddlePaddle can evaluate a model on data with labels specified in `test.list`. - -```bash -paddle train \ ---config=trainer_config.lstm.py \ ---use_gpu=false \ ---job=test \ ---init_model_path=./output/pass-0000x -``` - -We will give an example of performing prediction with the recurrent model on a dataset with no labels. You can refer to the Python Prediction API tutorial, or other demos, for the prediction process using Python. You can also use the following script for inference or evaluation. - -Inference script (predict.sh): - -```bash -model="output/pass-00003" -paddle train \ - --config=trainer_config.lstm.py \ - --use_gpu=false \ - --job=test \ - --init_model_path=$model \ - --config_args=is_predict=1 \ - --predict_output_dir=. - -mv rank-00000 result.txt -``` -You can choose the best model based on the training log instead of the model `output/pass-00003`. There are several differences between the training and inference network configurations. -- You do not need labels during inference. -- Outputs need to be specified as the classification probability layer (the output of the softmax layer), or the id of the maximum probability (the `max_id` layer). An example that outputs the id and probability is given in the code snippet below. -- batch_size = 1. -- You need to specify the location of the test data in `test_list`. - -The results in `result.txt` are as follows; each line is one sample. - -``` -predicted_label_id;probability_of_label_0 probability_of_label_1 # the first sample -predicted_label_id;probability_of_label_0 probability_of_label_1 # the second sample -``` - - -```python -is_predict = get_config_arg('is_predict', bool, False) -trn = 'data/train.list' if not is_predict else None -tst = 'data/test.list' if not is_predict else 'data/pred.list' -obj = 'process' if not is_predict else 'process_pre' -batch_size = 128 if not is_predict else 1 -if is_predict: - maxid = maxid_layer(output) - outputs([maxid, output]) -else: - label = data_layer(name="label", size=2) - cls = classification_cost(input=output, label=label) - outputs(cls) -``` - -## Summary -The scripts for data downloading, the network configurations, and the training scripts are in `demo/quick_start`. The following table summarizes the performance of our network architectures on the Amazon-Elec dataset (25k): -
| Network name | Number of parameters | Error rate | Configuration file name |
| --- | --- | --- | --- |
| Logistic regression model (BOW) | 252 KB | 8.652% | trainer_config.lr.py |
| Word embedding model | 15 MB | 8.484% | trainer_config.emb.py |
| Convolution model | 16 MB | 5.628% | trainer_config.cnn.py |
| Time sequence model | 16 MB | 4.812% | trainer_config.lstm.py |
- -## Appendix -### Command Line Arguments - -* \--config: network architecture configuration path. -* \--save_dir: model save directory. -* \--log_period: print a log every log_period batches. -* \--num_passes: number of training passes. One pass means the training goes over the whole training dataset once. -* \--config_args: other configuration arguments. -* \--init_model_path: the path of the initial model parameters. - -By default, the trainer saves a model every pass. You can also specify `saving_period_by_batches` to set the frequency of saving by batches. You can use `show_parameter_stats_period` to print the statistics of the parameters, which is very useful for tuning. Other command line arguments can be found in the command line argument documentation. - -### Log - -``` -TrainerInternal.cpp:160] Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297 CurrentEval: classification_error_evaluator=0.304297 -``` -During model training, you will see log lines like the example above: -
| Name | Explanation |
| --- | --- |
| Batch=20 | You have trained 20 batches. |
| samples=2560 | You have trained 2560 examples. |
| AvgCost | The average cost from the first batch to the current batch. |
| CurrentCost | The average cost of the last log_period batches. |
| Eval: classification_error_evaluator | The average classification error from the first batch to the current batch. |
| CurrentEval: classification_error_evaluator | The average error rate of the last log_period batches. |
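If you want to track these quantities programmatically, for example to plot a training curve, a small script along the lines of the sketch below could extract them from the training log. It only assumes the log line format shown above.

```python
import re

line = ("TrainerInternal.cpp:160] Batch=20 samples=2560 AvgCost=0.628761 "
        "CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297 "
        "CurrentEval: classification_error_evaluator=0.304297")

# Pull out the numeric fields explained in the table above.
batch = int(re.search(r"Batch=(\d+)", line).group(1))
samples = int(re.search(r"samples=(\d+)", line).group(1))
avg_cost = float(re.search(r"AvgCost=([\d.]+)", line).group(1))
current_cost = float(re.search(r"CurrentCost=([\d.]+)", line).group(1))
# The first classification_error_evaluator belongs to Eval:, the second to CurrentEval:.
eval_err, current_eval_err = [
    float(v) for v in re.findall(r"classification_error_evaluator=([\d.]+)", line)
]

print(batch, samples, avg_cost, current_cost, eval_err, current_eval_err)
```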
diff --git a/doc/tutorials/quick_start/src/NetContinuous_cn.jpg b/doc/tutorials/quick_start/src/NetContinuous_cn.jpg deleted file mode 100755 index b18e452a481232f47c453fc30521f8c78ad2da0b..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/NetContinuous_cn.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/NetContinuous_en.png b/doc/tutorials/quick_start/src/NetContinuous_en.png deleted file mode 100644 index 7bdef1aa366711806585d35c8653c987fd63d59e..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/NetContinuous_en.png and /dev/null differ diff --git a/doc/tutorials/quick_start/src/NetConv_cn.jpg b/doc/tutorials/quick_start/src/NetConv_cn.jpg deleted file mode 100755 index 0f5ebfa52ff2abcfc63464d48f00e406d4cbfe86..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/NetConv_cn.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/NetConv_en.png b/doc/tutorials/quick_start/src/NetConv_en.png deleted file mode 100644 index ad618d1d6f8f4839f566f5f5cb5db37a4b7d9093..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/NetConv_en.png and /dev/null differ diff --git a/doc/tutorials/quick_start/src/NetLR_cn.jpg b/doc/tutorials/quick_start/src/NetLR_cn.jpg deleted file mode 100755 index ee65d1f4127a347e991b4de5e5208347b2b0f51f..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/NetLR_cn.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/NetLR_en.png b/doc/tutorials/quick_start/src/NetLR_en.png deleted file mode 100644 index 9d514bf1b18a0c330f98c28785e5d008f409fc1d..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/NetLR_en.png and /dev/null differ diff --git a/doc/tutorials/quick_start/src/NetRNN_cn.jpg b/doc/tutorials/quick_start/src/NetRNN_cn.jpg deleted file mode 100755 index f8bc081827f3a50af80495b4561813c5efe9c9dc..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/NetRNN_cn.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/NetRNN_en.png b/doc/tutorials/quick_start/src/NetRNN_en.png deleted file mode 100644 index 180f273d32ea59dc8ececa69c08e249f79f9d4f7..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/NetRNN_en.png and /dev/null differ diff --git a/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg deleted file mode 100755 index 7e68891d7a536b06027ada6b40cb13dbc3c2d72e..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg deleted file mode 100644 index e779aed06d5cdb2b442754e7915e79b72946418e..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/PipelineTest_cn.jpg b/doc/tutorials/quick_start/src/PipelineTest_cn.jpg deleted file mode 100755 index 01715db886d811a776c9e7d47f32ba8b1ffe6230..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/PipelineTest_cn.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/PipelineTest_en.png b/doc/tutorials/quick_start/src/PipelineTest_en.png deleted file mode 100644 index 
7e7ef520b5effa2f43fd2964048f05c42f2ea890..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/PipelineTest_en.png and /dev/null differ diff --git a/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg b/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg deleted file mode 100755 index 8049d3e53c483a4d4956a4840b9a0dce5c2990f1..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/PipelineTrain_en.png b/doc/tutorials/quick_start/src/PipelineTrain_en.png deleted file mode 100644 index 132d29bfd5d678d2518161d0b5ed2e16a233a048..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/PipelineTrain_en.png and /dev/null differ diff --git a/doc/tutorials/quick_start/src/Pipeline_cn.jpg b/doc/tutorials/quick_start/src/Pipeline_cn.jpg deleted file mode 100755 index d5d99253ea528ebd2d8af6cddc6a9f00e20de8cd..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/Pipeline_cn.jpg and /dev/null differ diff --git a/doc/tutorials/quick_start/src/Pipeline_en.jpg b/doc/tutorials/quick_start/src/Pipeline_en.jpg deleted file mode 100644 index 21a7a7bb6a1af746120e6f4f51f797b6aaafb9d8..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/quick_start/src/Pipeline_en.jpg and /dev/null differ diff --git a/doc/tutorials/rec/ml_dataset_cn.md b/doc/tutorials/rec/ml_dataset_cn.md deleted file mode 100644 index 2207a776f0774e72aba15169e59258dd04583637..0000000000000000000000000000000000000000 --- a/doc/tutorials/rec/ml_dataset_cn.md +++ /dev/null @@ -1,105 +0,0 @@ -```eval_rst -.. _demo_ml_dataset: - -``` - -# MovieLens数据集 - -[MovieLens 数据集](http://grouplens.org/datasets/movielens/)由GroupLens Research实验室搜集整理。 -该数据集包含一些用户信息、电影信息以及电影评分\[1-5\]。根据数据量规模,该数据及有很多不同的版本。 -我们用[MovieLens 百万数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)作为示例数据 -集,其中包含6,000位用户对4,000部电影的1,000,000条评价。该数据集于2003年2月发布。 - -## 数据集特征 - -在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中有许多的特征。在[ml-1m 数据集] -(http://files.grouplens.org/datasets/movielens/ml-1m.zip)中的这些数据文件(含有".dat"的后缀)实际上是CSV文件, -分隔符为"::"。以下我们翻译数据集网站中README文件的描述: - -### 评分文件描述(ratings.dat) - - -所有的评分数据都包含在"ratings.dat"文件中,遵循如下的格式: - -用户ID::电影ID::评分::时间戳 - -- 用户ID范围从1到6040 -- 电影ID范围从1到3952 -- 评分被调整为5星的规模(只允许整数的星级) -- 时间戳表示为从1970-01-01(UTC)来的秒数,与time(2)的返回值一致 -- 每位用户至少有20条评分 - -### 用户文件描述(users.dat) - -所有的用户信息都包含在"users.dat"文件中,遵循如下的格式: - -用户ID::性别::年龄::职业::邮编 - -所有的人口统计学信息由用户自愿提供,没有进行正确性的检查。只有含有人 -口统计学信息的用户才被包含在数据集中。 - -- 性别,用"M"表示男性,"F"表示女性 -- 年龄从下列列表范围中选取: - - * 1: "18岁以下" - * 18: "18-24岁" - * 25: "25-34岁" - * 35: "35-44岁" - * 45: "45-49岁" - * 50: "50-55岁" - * 56: "56+" - -- 职业从下面所列中选择: - - * 0: "其他"或不确定 - * 1: "学术/教育工作者" - * 2: "艺术家" - * 3: "文书工作/管理员" - * 4: "大学生/研究生" - * 5: "客户服务" - * 6: "医生/医疗保健" - * 7: "行政工作/管理人员" - * 8: "农民" - * 9: "操持家务者" - * 10: "高中毕业生" - * 11: "律师" - * 12: "程序员" - * 13: "退休人员" - * 14: "销售/市场" - * 15: "科学家" - * 16: "自由职业者" - * 17: "技术员/工程师" - * 18: "推销员/手工艺者" - * 19: "无业人士" - * 20: "作家" - -### 电影文件描述(movies.dat) - -所有的电影信息都包含在"movies.dat"文件中,遵循如下的格式: - -电影ID::电影名称::电影类型 - -- 电影名称(包括发行时间)与IMDB网站提供的一致 -- 电影类型如符合多种用管道符号|分割,选自下列类型: - - * 动作片 - * 冒险片 - * 动画片 - * 儿童片 - * 喜剧片 - * 犯罪片 - * 纪录片 - * 戏剧 - * 奇幻片 - * 黑色电影 - * 恐怖片 - * 音乐剧 - * 悬疑片 - * 浪漫片 - * 科幻片 - * 惊险电影 - * 战争片 - * 西部片 - -- 由于意外的副本记录和测试记录,有些电影ID可能与实际电影不相符合 -- 电影大部分是手工输入数据,因此可能会有一些错误和不一致发生 diff --git a/doc/tutorials/rec/ml_dataset_en.md 
b/doc/tutorials/rec/ml_dataset_en.md deleted file mode 100644 index 25dea5c4afbf1ce1c1ac6195cbd245b116459e2e..0000000000000000000000000000000000000000 --- a/doc/tutorials/rec/ml_dataset_en.md +++ /dev/null @@ -1,111 +0,0 @@ -```eval_rst -.. _demo_ml_dataset: -``` - -# MovieLens Dataset - -The [MovieLens Dataset](http://grouplens.org/datasets/movielens/) was collected by GroupLens Research. -The dataset contains some user information, movie information, and many movie ratings from \[1-5\]. -The dataset has several versions, depending on its size. -We use the [MovieLens 1M Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) as a demo dataset, which contains -1 million ratings from 6000 users on 4000 movies. It was released in 2/2003. - -## Dataset Features - -There are many features in the [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip). -The data files (which have the ".dat" extension) in the [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) -are essentially CSV files whose delimiter is "::". We quote the description from its README here. - -### RATINGS FILE DESCRIPTION(ratings.dat) - - -All ratings are contained in the file "ratings.dat" and are in the -following format: - -UserID::MovieID::Rating::Timestamp - -- UserIDs range between 1 and 6040 -- MovieIDs range between 1 and 3952 -- Ratings are made on a 5-star scale (whole-star ratings only) -- Timestamp is represented in seconds since the epoch as returned by time(2) -- Each user has at least 20 ratings - -### USERS FILE DESCRIPTION(users.dat) - -User information is in the file "users.dat" and is in the following -format: - -UserID::Gender::Age::Occupation::Zip-code - -All demographic information is provided voluntarily by the users and is -not checked for accuracy. Only users who have provided some demographic -information are included in this data set.
- -- Gender is denoted by a "M" for male and "F" for female -- Age is chosen from the following ranges: - - * 1: "Under 18" - * 18: "18-24" - * 25: "25-34" - * 35: "35-44" - * 45: "45-49" - * 50: "50-55" - * 56: "56+" - -- Occupation is chosen from the following choices: - - * 0: "other" or not specified - * 1: "academic/educator" - * 2: "artist" - * 3: "clerical/admin" - * 4: "college/grad student" - * 5: "customer service" - * 6: "doctor/health care" - * 7: "executive/managerial" - * 8: "farmer" - * 9: "homemaker" - * 10: "K-12 student" - * 11: "lawyer" - * 12: "programmer" - * 13: "retired" - * 14: "sales/marketing" - * 15: "scientist" - * 16: "self-employed" - * 17: "technician/engineer" - * 18: "tradesman/craftsman" - * 19: "unemployed" - * 20: "writer" - -### MOVIES FILE DESCRIPTION(movies.dat) - -Movie information is in the file "movies.dat" and is in the following -format: - -MovieID::Title::Genres - -- Titles are identical to titles provided by the IMDB (including -year of release) -- Genres are pipe-separated and are selected from the following genres: - - * Action - * Adventure - * Animation - * Children's - * Comedy - * Crime - * Documentary - * Drama - * Fantasy - * Film-Noir - * Horror - * Musical - * Mystery - * Romance - * Sci-Fi - * Thriller - * War - * Western - -- Some MovieIDs do not correspond to a movie due to accidental duplicate -entries and/or test entries -- Movies are mostly entered by hand, so errors and inconsistencies may exist diff --git a/doc/tutorials/rec/ml_regression_cn.rst b/doc/tutorials/rec/ml_regression_cn.rst deleted file mode 100644 index 9278c9f603b648099f448963bc2246b8dc014ab7..0000000000000000000000000000000000000000 --- a/doc/tutorials/rec/ml_regression_cn.rst +++ /dev/null @@ -1,349 +0,0 @@ -MovieLens数据集评分回归模型 -=========================== - -这里我们在MovieLens数据集描述一种 **余弦相似度回归** 任务。 -该示例将展示paddle如何进行词向量嵌入,处理相似度回归,针对文本 -的单词级别的卷积神经网络,以及paddle如何处理多种类型的输入。 -需要注意的是,该模型网络只是用于进行demo展示paddle如何工作,而 -没有进行结构的微调。 - - -**我们非常欢迎您用PADDLEPADDLE构建更好的示例,如果您有好的建议来 -让这个示例变得更好,希望能让我们知晓。** - -数据准备 -````````` -下载并解压数据集 -''''''''''''''''' -这里我们使用 :ref:`demo_ml_dataset` 。 -要下载和解压数据集,只需要简单的运行下面的命令即可。 - -.. code-block:: bash - - cd demo/recommendation/data - ./ml_data.sh - -:code:`demo/recommendation/data/ml-1m` 的目录结构为: - -.. code-block:: text - - +--ml-1m - +--- movies.dat # 电影特征 - +--- ratings.dat # 评分 - +--- users.dat # 用户特征 - +--- README # 数据集描述 - -字段配置文件 -''''''''''''' -**字段配置文件** 用来具体说明数据集的字段和文件格式, -例如,说明每个特征文件具体字段是 **什么** 类型。 - -ml-1m的字段配置文件在目录 :code:`demo/recommendation/data/config.json` 中。 -其具体说明了字段类型和文件名称: - -1) 用户文件中有四种类型的字段\: 编号,性别,年龄和职业; - -2) 文件名称为"users.dat",文件的分隔符为"::"。 - -.. include:: ../../../demo/recommendation/data/config.json - :code: json - :literal: - -准备数据 -````````` -你需要安装python的第三方库。 -**强烈推荐使用VIRTUALENV来创造一个干净的python环境。** - -.. code-block:: bash - - pip install -r requirements.txt - -预处理数据一般的命令为: - -.. code-block:: bash - - cd demo/recommendation - ./preprocess.sh - -下面介绍预处理过程具体的步骤。 - -提取电影或用户的特征并生成python对象 -''''''''''''''''''''''''''''''''''''' - -在movielens 1m数据集中,电影和用户有许多的特征。 -评分文件的每一行仅仅提供电影或用户的编号来代表相应的电影或用户。 -我们首先处理电影或用户的特征文件,然后用pickle命令将特征( **Meta** )对象存储为文件。 - -Meta配置文件 -............. - -**Meta配置文件** 用来具体描述 **如何** 解析数据集中的每一个字段。 -该文件可以从字段配置文件生成,或是手动编辑生成。文件的格式可以 -为json或yaml格式。解析器能通过文件的扩展名自动识别文件的格式。 - -要将字段配置文件转化为meta配置文件,只需要运行: - -.. code-block:: bash - - cd demo/recommendation/data - python config_generator.py config.json > meta_config.json - -生成的meta配置文件如下所示: - -.. 
include:: ../../../demo/recommendation/data/meta_config.json - :code: json - :literal: - -在meta文件中有两种特征\: 电影和用户。 - -* 在电影文件movies.dat中 - * 我们仅用"::"来分隔每一行 - * pos 0 代表编号 - * pos 1 特征: - * name是电影名 - * 利用正则表达式来解析该特征 - * 基于字母的词嵌入特征 - * 是序列 - * pos 2 特征: - * name是体裁 - * type是one hot稠密向量 - * dictionary由解析自动生成,每一个key由'|'分隔 -* 在用户文件users.dat中 - * 我们仅用"::"来分隔每一行 - * pos 0 代表编号 - * pos 1 特征: - * name是性别 - * 简单的基于字母的词嵌入 - * pos 2 特征: - * name是年龄 - * 是整个的词嵌入 - * 嵌入编号会根据单词排序 - * pos 3 特征: - * name是职业 - * 简单的整个词嵌入 - - -Meta文件 -'''''''' - -有了meta配置文件之后,我们可以生成 **Meta文件** ,该文件是python的pickle对象, -存储着电影或用户信息。可以运行下面的命令来生成。 - -.. code-block:: bash - - python meta_generator.py ml-1m meta.bin --config=meta_config.json - -meta文件 :code:`meta.bin` 的结构如下: - -.. code-block:: text - - +--+ movie - | +--+ __meta__ - | | +--+ raw_meta # 每个特征的meta配置。列表 - | | | + - | | | | # 编号字段,我们用编号作为key - | | | +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1} - | | | | - | | | | # 电影名字段,嵌入特征字典 - | | | +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'} - | | | | - | | | | # 体裁字段,体裁字典 - | | | +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'} - | | | - | | +--+ feature_map [1, 2] # a list for raw_meta index for feature field. - | | # it means there are 2 features for each key. - | | # * 0 offset of feature is raw_meta[1], Title. - | | # * 1 offset of feature is raw_meta[2], Genres. - | | - | +--+ 1 # 电影1的特征 - | | + - | | +---+ [[...], [...]] # title ids, genres dense vector - | | - | +--+ 2 - | | - | +--+ ... - | - +--- user - +--+ __meta__ - | + - | +--+ raw_meta - | | + - | | +--+ id field as user - | | | - | | +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'} - | | - | +--+ feature_map [1, 2, 3] - | - +--+ 1 # 用户1的特征 - | - +--+ 2 - +--+ ... - - -分割训练/测试文件 -'''''''''''''''''' - -我们将 :code:`ml-1m/ratings.dat` 文件分割为训练和测试文件。分割文件的方法是:对于每位用户,我们将评分分成两部分。 -这样的话每位用户在测试文件中将与训练文件含有同样的信息。 - -用 :code:`separate.py` 来分离训练和测试文件。 - -.. code-block:: bash - - python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1 - -这样就会生成两个文件::code:`ml-1m/ratings.dat.train` 和 :code:`ml-1m/ratings.data.test` 。 -将他们移动到目录 :code:`data` ,然后进行随机打乱,再为paddle的训练过程提供文件列表。 - -.. code-block:: bash - - shuf ml-1m/ratings.dat.train > ratings.dat.train - cp ml-1m/ratings.dat.test . - echo "./data/ratings.dat.train" > train.list - echo "./data/ratings.dat.test" > test.list - - -神经网络结构配置 -````````````````` - -训练器配置文件 -''''''''''''''' - -网络结构如下图所示: - -.. image:: rec_regression_network.png - :align: center - :alt: rec_regression_network - -该示例的神经网络配置文件 :code:`trainer_config.py` 如下所示: - -.. 
literalinclude:: ../../../demo/recommendation/trainer_config.py - :language: python - :lines: 15- - -在文件 :code:`trainer_config.py` 中,我们仅仅是将每个特征种类映射到一个特征向量中,以下 -展示了如何将每个特征映射到一个向量。 - -* :code:`id` \: 仅仅是简单的嵌入,然后添加一个全连接层。 -* :code:`embedding` \: - - 如果是序列,则先做嵌入,然后再做一次文本卷积网络操作, - 然后得到平均采样的结果。 - - 如果不是序列,则先做嵌入,然后添加一个全连接层。 -* :code:`one_host_dense` \: - - 仅仅是两个全连接层。 - -然后我们利用多输入的:code:`fc_layer` 全连接层将电影的每个特征结合成一个电影特征, -并且对用户的特征做同样的操作,也得到一个用户特征。然后我们求这两个特征的余弦相似度。 - -在这些网络中,我们用以下的一些:ref:`api_trainer_config` 中的接口。 - -* 数据层, :ref:`api_trainer_config_helpers_layers_data_layer` -* 全连接层, :ref:`api_trainer_config_helpers_layers_fc_layer` -* 嵌入层, :ref:`api_trainer_config_helpers_layers_embedding_layer` -* 文本投影层, :ref:`api_trainer_config_helpers_layers_context_projection` -* 采样层, :ref:`api_trainer_config_helpers_layers_pooling_layer` -* 余弦相似度层, :ref:`api_trainer_config_helpers_layers_cos_sim` -* 文本卷积采样层, :ref:`api_trainer_config_helpers_network_text_conv_pool` -* 声明Python数据源, :ref:`api_trainer_config_helpers_data_sources` - -数据提供脚本 -''''''''''''' - -.. literalinclude:: ../../../demo/recommendation/dataprovider.py - :language: python - :lines: 15- - -数据提供脚本仅仅是读取meta.bin和评分文件,生成训练需要的样本。 -在脚本 :code:`dataprovider.py` 中,我们需要设置: - -* obj.slots\: 特征的类型和维度。 -* use_seq\: :code:`dataprovider.py` 中的数据是否为序列模式。 -* process\: 返回数据的每一条样本给 :code:`paddle` 。 - -数据提供脚本的细节文档可以参考 :ref:`api_pydataprovider2` 。 - -训练 -```` - -准备好数据,配置了网络,编写好数据提供脚本后,现在我们可以开始paddle训练了。 - -代码 :code:`run.sh` 如下: - -.. literalinclude:: ../../../demo/recommendation/run.sh - :language: bash - :lines: 16- - -该脚本仅仅是开始一个paddle训练过程,将日志写入文件 :code:`log.txt` ,然后 -打印在屏幕上。 - -脚本 :code:`run.sh` 中的每一行命令,请参考页面 :ref:`cmd_line_index` 。 -这些参数的简短介绍如下: - -* config\: 告诉paddle哪个文件是神经网络的配置文件。 -* save_dir\: 告诉paddle将模型保存在: code:`./output` 中。 -* use_gpu\: 是否使用GPU,默认为不使用。 -* trainer_count\: 一台机器上面的线程数量。 -* test_all_data_in_one_period\: 每一个测试周期测试一次所有数据。否则, - 每个测试周期测试: code:`batch_size` 批次的数据。 -* log_period\: 在训练了: code:`log_period` 批次后打印日志。 -* dot_period\: 在每训练: code:`dot_period` 个批次后打印一个 :code:`.` 。 -* num_passes\: 训练至多: code:`num_passes` 轮。 - -如果训练过程启动成功的话,输出应该类似如下: - -.. code-block:: text - - I0601 08:07:22.832059 10549 TrainerInternal.cpp:157] Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval: CurrentEval: - - I0601 08:07:50.672627 10549 TrainerInternal.cpp:157] Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval: CurrentEval: - - I0601 08:08:18.877369 10549 TrainerInternal.cpp:157] Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval: CurrentEval: - - I0601 08:08:46.863963 10549 TrainerInternal.cpp:157] Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval: CurrentEval: - - I0601 08:09:15.413025 10549 TrainerInternal.cpp:157] Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval: CurrentEval: - I0601 08:09:36.058670 10549 TrainerInternal.cpp:181] Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval: - I0601 08:09:46.215489 10549 Tester.cpp:101] Test samples=97383 cost=3.32155 Eval: - I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000 - I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000 - I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000 - I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py - -模型被保存在 :code:`output/` 目录中。你可以在任何时候用 :code:`Ctrl-C` 来停止训练。 - -模型评估和预测 -``````````````` - -在训练了几个轮次以后,你可以对模型进行评估,得到最好轮次下的模型。运行下面命令即可: - -.. 
code-block:: bash - - ./evaluate.sh - -你将看到如下的信息: - -.. code-block:: text - - Best pass is 00009, error is 3.06949, which means predict get error as 0.875998002281 - evaluating from pass output/pass-00009 - -然后,你可以预测任何用户对于任何一部电影的评价,运行下面命令即可: - -.. code-block:: bash - - python prediction.py 'output/pass-00009/' - -预测程序将读取用户的输入,然后输出预测分数。用户预测的命令行界面如下: - -.. code-block:: text - - Input movie_id: 9 - Input user_id: 4 - Prediction Score is 2.56 - Input movie_id: 8 - Input user_id: 2 - Prediction Score is 3.13 diff --git a/doc/tutorials/rec/ml_regression_en.rst b/doc/tutorials/rec/ml_regression_en.rst deleted file mode 100644 index 993b9a516f134ff8b59e8755b721f76c8f32f0fd..0000000000000000000000000000000000000000 --- a/doc/tutorials/rec/ml_regression_en.rst +++ /dev/null @@ -1,348 +0,0 @@ -Regression MovieLens Ratting -============================ - -Here we demonstrate a **Cosine Similarity Regression** job in movie lens dataset. -This demo will show how paddle does (word) embedding job, -handles the similarity regression, -the character-level convolutional networks for text, and how does paddle handle -multiple types of inputs. -Note that the model structure is not fine-tuned and just a demo to show how paddle works. - - -YOU ARE WELCOME TO BUILD A BETTER DEMO -BY USING PADDLEPADDLE, AND LET US KNOW TO MAKE THIS DEMO BETTER. - -Data Preparation -```````````````` -Download and extract dataset -'''''''''''''''''''''''''''' -We use :ref:`demo_ml_dataset` here. -To download and unzip the dataset, simply run the following commands. - -.. code-block:: bash - - cd demo/recommendation/data - ./ml_data.sh - -And the directory structure of :code:`demo/recommendation/data/ml-1m` is: - -.. code-block:: text - - +--ml-1m - +--- movies.dat # movie features - +--- ratings.dat # ratings - +--- users.dat # user features - +--- README # dataset description - -Field config file -''''''''''''''''' -**Field config file** is used to specify the fields of the dataset and the file format, -i.e, specific **WHAT** type it is in each feature file. - -The field config file of ml-1m shows in :code:`demo/recommendation/data/config.json`. -It specifics the field types and file names: 1) there are four types of field for user file\: id, gender, age and occupation; -2) the filename is "users.dat", and the delimiter of file is "::". - -.. include:: ../../../demo/recommendation/data/config.json - :code: json - :literal: - -Preprocess Data -``````````````` -You need to install python 3rd party libraries. -IT IS HIGHLY RECOMMEND TO USE VIRTUALENV MAKE A CLEAN PYTHON ENVIRONMENT. - -.. code-block:: bash - - pip install -r requirements.txt - -The general command for preprocessing the dataset is: - -.. code-block:: bash - - cd demo/recommendation - ./preprocess.sh - -And the detail steps are introduced as follows. - -Extract Movie/User features to python object -''''''''''''''''''''''''''''''''''''''''''''' - -There are many features in movie or user in movielens 1m dataset. -Each line of rating file just provides a Movie/User id to refer each movie or user. -We process the movie/user feature file first, and pickle the feature (**Meta**) object as a file. - -Meta config file -................ - -**Meta config file** is used to specific **HOW** to parse each field in dataset. -It could be translated from field config file, or written by hand. -Its file format could be either json or yaml syntax file. Parser will automatically choose the file format by extension name. - -To convert Field config file to meta config file, just run: - -.. 
code-block:: bash - - cd demo/recommendation/data - python config_generator.py config.json > meta_config.json - -The meta config file shows below: - -.. include:: ../../../demo/recommendation/data/meta_config.json - :code: json - :literal: - -There are two kinds of features in meta\: movie and user. - -* in movie file, whose name is movies.dat - * we just split each line by "::" - * pos 0 is id. - * pos 1 feature: - * name is title. - * it uses regex to parse this feature. - * it is a char based word embedding feature. - * it is a sequence. - * pos 2 feature: - * name is genres. - * type is one hot dense vector. - * dictionary is auto generated by parsing, each key is split by '|' -* in user file, whose name is users.dat - * we just split each line by "::" - * pos 0 is id. - * pos 1 feature: - * name is gender - * just simple char based embedding. - * pos 2 feature: - * name is age - * just whole word embedding. - * embedding id will be sort by word. - * pos 3 feature: - * name is occupation. - * just simple whole word embedding. - - -Meta file -''''''''' - -After having meta config file, we can generate **Meta file**, a python pickle object which stores movie/user information. -The following commands could be run to generate it. - -.. code-block:: bash - - python meta_generator.py ml-1m meta.bin --config=meta_config.json - -And the structure of the meta file :code:`meta.bin` is: - -.. code-block:: text - - +--+ movie - | +--+ __meta__ - | | +--+ raw_meta # each feature meta config. list - | | | + - | | | | # ID Field, we use id as key - | | | +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1} - | | | | - | | | | # Titile field, the dictionary list of embedding. - | | | +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'} - | | | | - | | | | # Genres field, the genres dictionary - | | | +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'} - | | | - | | +--+ feature_map [1, 2] # a list for raw_meta index for feature field. - | | # it means there are 2 features for each key. - | | # * 0 offset of feature is raw_meta[1], Title. - | | # * 1 offset of feature is raw_meta[2], Genres. - | | - | +--+ 1 # movie 1 features - | | + - | | +---+ [[...], [...]] # title ids, genres dense vector - | | - | +--+ 2 - | | - | +--+ ... - | - +--- user - +--+ __meta__ - | + - | +--+ raw_meta - | | + - | | +--+ id field as user - | | | - | | +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'} - | | - | +--+ feature_map [1, 2, 3] - | - +--+ 1 # user 1 features - | - +--+ 2 - +--+ ... - - -Split Training/Testing files -'''''''''''''''''''''''''''' - -We split :code:`ml-1m/ratings.dat` into a training and testing file. The way to split file is for each user, we split the -rating by two parts. So each user in testing file will have some rating information in training file. - -Use :code:`separate.py` to separate the training and testing file. - -.. code-block:: bash - - python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1 - -Then two files will be generated\: :code:`ml-1m/ratings.dat.train` and :code:`ml-1m/rating.data.test`. -Move them to workspace :code:`data`, shuffle the train file, and prepare the file list for paddle train. - -.. 
code-block:: bash - - shuf ml-1m/ratings.dat.train > ratings.dat.train - cp ml-1m/ratings.dat.test . - echo "./data/ratings.dat.train" > train.list - echo "./data/ratings.dat.test" > test.list - - -Neural Network Configuration -```````````````````````````` - -Trainer Config File -''''''''''''''''''' - -The network structure shows below. - -.. image:: rec_regression_network.png - :align: center - :alt: rec_regression_network - -The demo's neural network config file :code:`trainer_config.py` show as below. - -.. literalinclude:: ../../../demo/recommendation/trainer_config.py - :language: python - :lines: 15- - -In this :code:`trainer_config.py`, we just map each feature type to -a feature vector, following shows how to map each feature to a vector shows below. - -* :code:`id`\: Just simple embedding, and then add to fully connected layer. -* :code:`embedding`\: - - if is_sequence, get the embedding and do a text convolutional operation, - get the average pooling result. - - if not sequence, get the embedding and add to fully connected layer. -* :code:`one_host_dense`\: - - just two fully connected layer. - -Then we combine each features of movie into one movie feature by a -:code:`fc_layer` with multiple inputs, and do the same thing to user features, -get one user feature. Then we calculate the cosine similarity of these two -features. - -In these networks, we use several APIs in :ref:`api_trainer_config` . There are - -* Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer` -* Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer` -* Embedding Layer, :ref:`api_trainer_config_helpers_layers_embedding_layer` -* Context Projection Layer, :ref:`api_trainer_config_helpers_layers_context_projection` -* Pooling Layer, :ref:`api_trainer_config_helpers_layers_pooling_layer` -* Cosine Similarity Layer, :ref:`api_trainer_config_helpers_layers_cos_sim` -* Text Convolution Pooling Layer, :ref:`api_trainer_config_helpers_network_text_conv_pool` -* Declare Python Data Sources :ref:`api_trainer_config_helpers_data_sources`. - -Data Provider -''''''''''''' - -.. literalinclude:: ../../../demo/recommendation/dataprovider.py - :language: python - :lines: 15- - -The data provider just read the meta.bin and rating file, yield each sample for training. -In this :code:`dataprovider.py`, we should set\: - -* obj.slots\: The feature types and dimension. -* use_seq\: Whether this :code:`dataprovider.py` in sequence mode or not. -* process\: Return each sample of data to :code:`paddle`. - -The data provider details document see :ref:`api_pydataprovider2`. - -Train -````` - -After prepare data, config network, writting data provider, now we can run paddle training. - -The :code:`run.sh` is shown as follow: - -.. literalinclude:: ../../../demo/recommendation/run.sh - :language: bash - :lines: 16- - -It just start a paddle training process, write the log to :code:`log.txt`, -then print it on screen. - -Each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. The short description of these arguments is shown as follow. - -* config\: Tell paddle which file is neural network configuration. -* save_dir\: Tell paddle save model into :code:`./output`. -* use_gpu\: Use gpu or not. Default is false. -* trainer_count\: The compute thread in one machine. -* test_all_data_in_one_period\: Test All Data during one test period. Otherwise, - will test a :code:`batch_size` data in one test period. -* log_period\: Print log after train :code:`log_period` batches. 
-* dot_period\: Print a :code:`.` after train :code:`dot_period` batches. -* num_passes\: Train at most :code:`num_passes`. - -If training process starts successfully, the output likes follow: - -.. code-block:: text - - I0601 08:07:22.832059 10549 TrainerInternal.cpp:157] Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval: CurrentEval: - - I0601 08:07:50.672627 10549 TrainerInternal.cpp:157] Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval: CurrentEval: - - I0601 08:08:18.877369 10549 TrainerInternal.cpp:157] Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval: CurrentEval: - - I0601 08:08:46.863963 10549 TrainerInternal.cpp:157] Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval: CurrentEval: - - I0601 08:09:15.413025 10549 TrainerInternal.cpp:157] Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval: CurrentEval: - I0601 08:09:36.058670 10549 TrainerInternal.cpp:181] Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval: - I0601 08:09:46.215489 10549 Tester.cpp:101] Test samples=97383 cost=3.32155 Eval: - I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000 - I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000 - I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000 - I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py - -The model is saved in :code:`output/` directory. You can use :code:`Ctrl-C` to stop training whenever you want. - -Evaluate and Predict -```````````````````` - -After training several passes, you can evaluate them and get the best pass. Just run - -.. code-block:: bash - - ./evaluate.sh - -You will see messages like this: - -.. code-block:: text - - Best pass is 00009, error is 3.06949, which means predict get error as 0.875998002281 - evaluating from pass output/pass-00009 - -Then, you can predict what any user will rate a movie. Just run - -.. code-block:: bash - - python prediction.py 'output/pass-00009/' - -Predictor will read user input, and predict scores. It has a command-line user interface as follows: - -.. code-block:: text - - Input movie_id: 9 - Input user_id: 4 - Prediction Score is 2.56 - Input movie_id: 8 - Input user_id: 2 - Prediction Score is 3.13 diff --git a/doc/tutorials/rec/rec_regression_network.png b/doc/tutorials/rec/rec_regression_network.png deleted file mode 100644 index 7d2b54d4fcf560cd5b667628f0012c3822efd9b2..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/rec/rec_regression_network.png and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/feature.jpg b/doc/tutorials/semantic_role_labeling/feature.jpg deleted file mode 100644 index 0e3310e4ace5613917e7779d3198ccbb3cdc5ada..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/feature.jpg and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/index_cn.md b/doc/tutorials/semantic_role_labeling/index_cn.md deleted file mode 100644 index f6061766c038a7bb6e4ae376685a10cd5669d2ed..0000000000000000000000000000000000000000 --- a/doc/tutorials/semantic_role_labeling/index_cn.md +++ /dev/null @@ -1,201 +0,0 @@ -# 语义角色标注教程 # - -语义角色标注(Semantic role labeling, SRL)是浅层语义解析的一种形式,其目的是在给定的输入句子中发现每个谓词的谓词论元结构。 SRL作为很多自然语言处理任务中的中间步骤是很有用的,如信息提取、文档自动分类和问答。 实例如下 [1]: - - [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ]. 
- -- V: 动词 -- A0: 接受者 -- A1: 接受的东西 -- A2: 从……接受 -- A3: 属性 -- AM-MOD: 情态动词 -- AM-NEG: 否定 - -给定动词“accept”,句子中的组块将会扮演某些语义角色。这里,标签方案来自 Penn Proposition Bank。 - -到目前为止,大多数成功的SRL系统是建立在某种形式的句法分析结果之上的,使用了基于句法结构的预定义特征模板。 本教程将介绍使用深度双向长短期记忆(DB-LSTM)模型[2]的端到端系统来解决SRL任务,这在很大程度上优于先前的最先进的系统。 这个系统将SRL任务视为序列标注问题。 - -## 数据描述 -相关论文[2]采用 CoNLL-2005&2012 共享任务中设置的数据进行训练和测试。由于数据许可的原因,演示采用 CoNLL-2005 的测试数据集,可以在网站上找到。 - -用户只需执行以下命令就可以下载并处理原始数据: - -```bash -cd data -./get_data.sh -``` -`data `目录会出现如下几个新的文件: -```bash -conll05st-release:the test data set of CoNll-2005 shared task -test.wsj.words:the Wall Street Journal data sentences -test.wsj.props: the propositional arguments -feature: the extracted features from data set -``` - -## 训练 -### DB-LSTM -请参阅情感分析的演示以了解有关长期短期记忆单元的更多信息。 - -与在 Sentiment Analysis 演示中使用的 Bidirectional-LSTM 不同,DB-LSTM 采用另一种方法来堆叠LSTM层。首先,标准LSTM以正向处理该序列。该 LSTM 层的输入和输出作为下一个 LSTM 层的输入,并被反向处理。这两个标准 LSTM 层组成一对 LSTM。然后我们堆叠一对对的 LSTM 层后得到深度 LSTM 模型。 - -下图展示了时间扩展的2层 DB-LSTM 网络。 -
-![pic](./network_arch.png) -
- -### 特征 -两个输入特征在这个流程中起着至关重要的作用:predicate(pred)和argument(arguments)。 还采用了两个其他特征:谓词上下文(ctx-p)和区域标记(mr)。 因为单个谓词不能精确地描述谓词信息,特别是当相同的词在句子中出现多于一次时。 使用谓词上下文,可以在很大程度上消除歧义。类似地,如果它位于谓词上下文区域中,则使用区域标记 mr = 1 来表示参数位置,反之则 mr = 0。这四个简单的特征是我们的SRL系统所需要的。上下文大小设置为1的一个样本的特征如下[2]所示: -
-![pic](./feature.jpg) -
- -在这个示例中,相应的标记句子是: - -[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] . - -在演示中, 我们采用上面的特征模板, 包括: `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` 并使用 `B/I/O` 方案来标记每个参数。这些特征和标签存储在 `feature` 文件中, 用`\t`分割。 - -### 数据提供 - -`dataprovider.py` 是一个包装数据的 Python 文件。 函数 `hook()` 定义了网络的数据槽。六个特征和标签都是索引槽。 -``` -def hook(settings, word_dict, label_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - #all inputs are integral and sequential type - settings.slots = [ - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(predicate_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(2), - integer_value_sequence(len(label_dict))] -``` -相应的数据迭代器如下: -``` -@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, - can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot -``` -函数 `process` 返回8个特征list和1个标签list。 - -### 神经网络配置 - -`db_lstm.py` 是在训练过程中加载字典并定义数据提供程序模块和网络架构的神经网络配置文件。 - -九个 `data_layer` 从数据提供程序加载实例。八个特征分别转换为向量,并由`mixed_layer`混合。 深度双向LSTM层提取softmax层的特征。目标函数是标签的交叉熵。 - -### 训练 -训练的脚本是 `train.sh`,用户只需执行: -```bash - ./train.sh -``` -`train.sh` 中的内容: -``` -paddle train \ - --config=./db_lstm.py \ - --use_gpu=0 \ - --log_period=5000 \ - --trainer_count=1 \ - --show_parameter_stats_period=5000 \ - --save_dir=./output \ - --num_passes=10000 \ - --average_test_period=10000000 \ - --init_model_path=./data \ - --load_missing_parameter_strategy=rand \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'train.log' -``` - -- \--config=./db_lstm.py : 网络配置文件 -- \--use_gpu=false: 使用 CPU 训练(如果已安装 PaddlePaddle GPU版本并想使用 GPU 训练可以设置为true,目前 crf_layer 不支持 GPU) -- \--log_period=500: 每20个batch输出日志 -- \--trainer_count=1: 设置线程数(或 GPU 数) -- \--show_parameter_stats_period=5000: 每100个batch显示参数统计 -- \--save_dir=./output: 模型输出路径 -- \--num_passes=10000: 设置数据遍历次数,一个pass意味着PaddlePaddle训练数据集中的所有样本被遍历一次 -- \--average_test_period=10000000: 每个 average_test_period 批次对平均参数进行测试 -- \--init_model_path=./data: 参数初始化路径 -- \--load_missing_parameter_strategy=rand: 随机初始不存在的参数 -- \--test_all_data_in_one_period=1: 在一个周期内测试所有数据 - - -训练后,模型将保存在目录`output`中。 我们的训练曲线如下: -
-![pic](./src/curve.jpg) -
- -### 测试 -测试脚本是 `test.sh`, 执行: -```bash - ./test.sh -``` -`tesh.sh` 的主要部分: -``` -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --config_args=is_test=1 \ -``` - - - \--config=./db_lstm.py: 网络配置文件 - - \--model_list=$model_list.list: 模型列表文件 - - \--job=test: 指示测试任务 - - \--config_args=is_test=1: 指示测试任务的标记 - - \--test_all_data_in_one_period=1: 在一个周期内测试所有数据 - - -### 预测 -预测脚本是 `predict.sh`,用户只需执行: -```bash - ./predict.sh - -``` -在`predict.sh`中,用户应该提供网络配置文件,模型路径,标签文件,字典文件,特征文件。 -``` -python predict.py - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file -``` - -`predict.py` 是主要的可执行python脚本,其中包括函数:加载模型,加载数据,数据预测。网络模型将输出标签的概率分布。 在演示中,我们使用最大概率的标签作为结果。用户还可以根据概率分布矩阵实现柱搜索或维特比解码。 - -预测后,结果保存在 `predict.res` 中。 - -## 引用 -[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. - -[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. diff --git a/doc/tutorials/semantic_role_labeling/index_en.md b/doc/tutorials/semantic_role_labeling/index_en.md deleted file mode 100644 index 92d7c634832119c718711a57c16f69492d405f28..0000000000000000000000000000000000000000 --- a/doc/tutorials/semantic_role_labeling/index_en.md +++ /dev/null @@ -1,204 +0,0 @@ -```eval_rst -.. _semantic_role_labeling: -``` - -# Semantic Role labeling Tutorial # - -Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]: - - [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ]. - -- V: verb -- A0: acceptor -- A1: thing accepted -- A2: accepted-from -- A3: Attribute -- AM-MOD: modal -- AM-NEG: negation - -Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. - -To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. - -## Data Description -The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license, the demo adopts the test data set of CoNLL-2005, which can be reached on website. - -To download and process the original data, user just need to execute the following command: - -```bash -cd data -./get_data.sh -``` -Several new files appear in the `data `directory as follows. 
-```bash -conll05st-release:the test data set of CoNll-2005 shared task -test.wsj.words:the Wall Street Journal data sentences -test.wsj.props: the propositional arguments -feature: the extracted features from data set -``` - -## Training -### DB-LSTM -Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. - -Unlike Bidirectional-LSTM that used in Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model. - -The following figure shows a temporal expanded 2-layer DB-LSTM network. -
-![pic](./src/network_arch.png) -
- -### Features -Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features, predicate context (ctx-p) and region mark (mr), are also adopted. A single predicate word cannot exactly describe the predicate information, especially when the same word appears more than once in a sentence; with the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote the argument position if it is located in the predicate context region, or mr = 0 if it is not. These four simple features are all we need for our SRL system. The features of one sample with the context size set to 1 are shown as follows [2]: -
-![pic](./src/feature.jpg) -
- -In this sample, the coresponding labelled sentence is: - -[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] . - -In the demo, we adopt the feature template as above, consists of : `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`. - -### Data Provider - -`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The Six features and label are all IndexSlots. -``` -def hook(settings, word_dict, label_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - #all inputs are integral and sequential type - settings.slots = [ - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(predicate_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(2), - integer_value_sequence(len(label_dict))] -``` -The corresponding data iterator is as following: -``` -@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, - can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot -``` -The `process`function yield 9 lists which are 8 features and label. - -### Neural Network Config -`db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure. - -Nine `data_layer` load instances from data provider. Eight features are transformed into embedddings respectively, and mixed by `mixed_layer` . Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels. - -### Run Training -The script for training is `train.sh`, user just need to execute: -```bash - ./train.sh -``` -The content in `train.sh`: -``` -paddle train \ - --config=./db_lstm.py \ - --use_gpu=0 \ - --log_period=5000 \ - --trainer_count=1 \ - --show_parameter_stats_period=5000 \ - --save_dir=./output \ - --num_passes=10000 \ - --average_test_period=10000000 \ - --init_model_path=./data \ - --load_missing_parameter_strategy=rand \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'train.log' -``` - -- \--config=./db_lstm.py : network config file. 
-- \--use_gpu=0: use CPU to train; set it to 1 if you have installed the GPU version of PaddlePaddle and want to train on GPU (note that `crf_layer` does not support GPU yet).
-- \--log_period=5000: print a log every 5000 batches.
-- \--trainer_count=1: set the thread number (or GPU count).
-- \--show_parameter_stats_period=5000: show parameter statistics every 5000 batches.
-- \--save_dir=./output: output path to save models.
-- \--num_passes=10000: set the number of passes; one pass in PaddlePaddle means training on all samples in the dataset once.
-- \--average_test_period=10000000: test on the averaged parameters every `average_test_period` batches.
-- \--init_model_path=./data: parameter initialization path.
-- \--load_missing_parameter_strategy=rand: randomly initialize parameters that are missing from the initialization path.
-- \--test_all_data_in_one_period=1: test all data in one period.
-
-
-After training, the models will be saved in the directory `output`. Our training curve is shown below:
-
-![pic](./src/curve.jpg) -
- -### Run testing -The script for testing is `test.sh`, user just need to execute: -```bash - ./test.sh -``` -The main part in `tesh.sh` -``` -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --config_args=is_test=1 \ -``` - - - \--config=./db_lstm.py: network config file - - \--model_list=$model_list.list: model list file - - \--job=test: indicate the test job - - \--config_args=is_test=1: flag to indicate test - - \--test_all_data_in_one_period=1: test all data in 1 period - - -### Run prediction -The script for prediction is `predict.sh`, user just need to execute: -```bash - ./predict.sh - -``` -In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file -``` -python predict.py - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file -``` - -`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix. - -After prediction, the result is saved in `predict.res`. - -## Reference -[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. - -[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. diff --git a/doc/tutorials/semantic_role_labeling/network_arch.png b/doc/tutorials/semantic_role_labeling/network_arch.png deleted file mode 100644 index 4ae7864212f2a0a38102ee7ff600527ea99fec82..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/network_arch.png and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/src/curve.jpg b/doc/tutorials/semantic_role_labeling/src/curve.jpg deleted file mode 100644 index baa35ae7f0a0b6c246f3a0d331735477ab8bcd70..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/src/curve.jpg and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/src/feature.jpg b/doc/tutorials/semantic_role_labeling/src/feature.jpg deleted file mode 100644 index 0e3310e4ace5613917e7779d3198ccbb3cdc5ada..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/src/feature.jpg and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/src/network_arch.png b/doc/tutorials/semantic_role_labeling/src/network_arch.png deleted file mode 100644 index 4ae7864212f2a0a38102ee7ff600527ea99fec82..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/src/network_arch.png and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/index_cn.md b/doc/tutorials/sentiment_analysis/index_cn.md deleted file mode 100644 index 1323ec1a6abb2e7b5eeb2fbfff9cce5fe78a2c06..0000000000000000000000000000000000000000 --- a/doc/tutorials/sentiment_analysis/index_cn.md +++ /dev/null @@ -1,325 +0,0 @@ -# 情感分析教程 - -情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性,给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如:把用户在购物网站、旅游网站、团购网站(亚马逊、天猫、淘宝等)上发表的评论分成正面评论和负面评论两类。 - -情感分析也常用于基于大量评论和个人博客来监控社会媒体。 
例如,研究人员分析了几个关于消费者信心和政治观点的调查,结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。 - -另一方面,抓取产品的用户评论并分析他们的情感,有助于理解用户对不同公司,不同产品,甚至不同竞争对手产品的偏好。 - -本教程将指导您完成长期短期记忆(LSTM)网络的训练过程,以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)(有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf))的句子的情感 。 此数据集包含电影评论及其相关联的类别标签,即正面和负面。 - -## 数椐准备 - -### IMDB 数椐介绍 - -训练模型之前, 我们需要预处理数椐并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数椐集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本,它不仅能够处理IMDB数据,还能处理其他用户自定义的数据。 为了使用提前编写的脚本,需要将标记的训练和测试样本移动到另一个路径,这已经在`get_imdb.sh`中完成。 - -``` -cd demo/sentiment/data -./get_imdb.sh -``` -如果数椐获取成功,你将在目录```./demo/sentiment/data```中看到下面的文件: - -``` -aclImdb get_imdb.sh imdb mosesdecoder-master -``` - -* aclImdb: 从外部网站上下载的原始数椐集。 -* imdb: 仅包含训练和测试数椐集。 -* mosesdecoder-master: Moses 工具。 - -IMDB数据集包含25,000个已标注过的高极性电影评论用于训练,25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得大于等于7,总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数椐集的结构如下: - -``` -imdbEr.txt imdb.vocab README test train -``` -* train: 训练数椐集。 -* test : 测试数椐集。 -* imdb.vocab: 字典文件。 -* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。 -* README: 数椐说明文档。 - -测试集和训练集目录包含下面的文件: - -``` -labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt -``` - -* pos: 正面评价样本,包含12,500个txt文件,每个文件是一个电影评论。 -* neg: 负面评价样本,包含12,500个txt文件,每个文件是一个电影评论。 -* unsup: 未标记的评价样本,包含50,000个txt文件。 -* urls_xx.txt: 每个评论的网址。 -* xxBow.feat: 用于统计词频的Bow模型特征。 - -### IMDB 数椐准备 - -在这个例子中,我们只使用已经标注过的训练集和测试集,且默认在训练集上构建字典,而不使用IMDB数椐集中的imdb.vocab做为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单单词和标点符号。执行下面的命令就可以预处理数椐。 - -``` -cd demo/sentiment/ -./preprocess.sh -``` -preprocess.sh: - -``` -data_dir="./data/imdb" -python preprocess.py -i data_dir -``` - -* data_dir: 输入数椐所在目录。 -* preprocess.py: 预处理脚本。 - -运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下: - -``` -dict.txt labels.list test.list test_part_000 train.list train_part_000 -``` -* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集, 训练集已经随机打乱。 -* train.list and test.list: 训练集和测试集文件列表。 -* dict.txt: 利用训练集生成的字典。 -* labels.txt: neg 0, pos 1, 含义:标签0表示负面的评论,标签1表示正面的评论。 - -### 用户自定义数椐预处理 - -如果你执行其它的用情感分析来分类文本的任务,可以按如下的结构来准备数椐. 我们提供了脚本来构建字典和预处理数椐。所以你只用按下面的结构来组织数椐就行了。 - -``` -dataset -|----train -| |----class1 -| | |----text_files -| |----class2 -| | |----text_files -| | ... -|----test -| |----class1 -| | |----text_files -| |----class2 -| | |----text_files -| | ... -``` -* dataset: 一级目录。 -* train, test: 二级目录。 -* class1,class2,...: 三级目录。 -* text_files: 文本格式的实例文件。 - -所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例,每一行表示一个实例。 为了充分的随机打乱训练集, 在预处理含有多行数椐的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单记和标点符号,如果你不需要这个操作,在运行`preprocess.sh`时加上`-t False`参数即可。 - -## 训练模型 - -在这步任务中,我们使用了循环神经网络(RNN)的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元,忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息,而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内,存储在记忆单元区块的历史信息被更新用来迭代的学习单词以合理的序列程现。 - -
![LSTM](src/lstm.png)
-
图表 1. LSTM [3]
- -情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常, ,仅仅是一些关键词,如形容词和副词,在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长,例如 IMDB的数椐集。 我们只所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先,它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二,它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三,它直接学习段落表示,而不是组合上下文级别信息。 - -在本演示中,我们提供两个网络,即双向LSTM和三层堆叠LSTM。 - -#### 双向LSTM - -图2是双向LSTM网络,后面连全连接层和softmax层。 - -
![BiLSTM](src/bi_lstm.jpg)
-
图 2. Bidirectional-LSTM
- -#### Stacked-LSTM -图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来,连接三个LSTM隐藏层,并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后,使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。 - -
![StackedLSTM](src/stacked_lstm.jpg)
-
图 3. Stacked-LSTM for sentiment analysis
- -**配置** - -进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。 - -trainer_config.py: - -```python -from sentiment_net import * - -data_dir = "./data/pre-imdb" -# whether this config is used for test -is_test = get_config_arg('is_test', bool, False) -# whether this config is used for prediction -is_predict = get_config_arg('is_predict', bool, False) -dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) - -################## Algorithm Config ##################### - -settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) - -#################### Network Config ###################### -stacked_lstm_net(dict_dim, class_dim=class_dim, - stacked_num=3, is_predict=is_predict) -#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) -``` - -* **数椐定义**: - * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。 - * 定义训练数椐和测试数椐提供者, 这里使用了PaddlePaddle的Python接口来加载数椐。想了解更多细节可以参考PyDataProvider部分的文档 - -* **算法配置**: - * 使用随机梯度下降(sgd)算法。 - * 使用 adam 优化。 - * 设置batch size大小为128。 - * 设置平均sgd窗口。 - * 设置全局学习率。 -* **网络配置**: - * dict_dim: 获取字典维度。 - * class_dim: 设置类别数,IMDB有两个标签,即正面评价标签和负面评价标签。 - * `stacked_lstm_net`: 预定义网络如图3所示,默认情况下使用此网络 - * `bidirectional_lstm_net`: 预定义网络,如图2所示。 - -**训练** - -首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。 - -``` -cd demo/sentiment/ -./train.sh -``` - -train.sh: - -``` -config=trainer_config.py -output=./model_output -paddle train --config=$config \ - --save_dir=$output \ - --job=train \ - --use_gpu=false \ - --trainer_count=4 \ - --num_passes=10 \ - --log_period=20 \ - --dot_period=20 \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -``` - -* \--config=$config: 设置网络配置。 -* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。 -* \--job=train: 设置工作模式为训练。 -* \--use\_gpu=false: 使用CPU训练,如果你安装GPU版本的PaddlePaddle,并想使用GPU来训练设置为true。 -* \--trainer\_count=4:设置线程数(或GPU个数)。 -* \--num\_passes=15: 设置pass,PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。 -* \--log\_period=20: 每20个batch打印一次日志。 -* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。 -* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。 - -如果运行成功,输出日志保存在路径 `demo/sentiment/train.log`中,模型保存在目录`demo/sentiment/model_output/`中。 输出日志说明如下: - -``` -Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875 CurrentEval: classification_error_evaluator=0.36875 -... 
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922 -Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406 -``` -- Batch=xx: 表示训练了xx个Batch。 -- samples=xx: 表示训练了xx个样本。。 -- AvgCost=xx: 从第0个batch到当前batch的平均损失。 -- CurrentCost=xx: 最新log_period个batch处理的当前损失。 -- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。 -- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。 -- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。 - -默认情况下,我们使用`stacked_lstm_net`网络,当传递相同的样本数时,它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM,只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。 - -## 测试模型 - -测试模型是指使用训练出的模型评估已标记的验证集。 - -``` -cd demo/sentiment -./test.sh -``` - -test.sh: - -```bash -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' -``` - -函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中,我们默认使用IMDB的测试数据集作为验证。 与训练不同,它需要在这里指定`--job = test`和模型路径,即`--model_list = $model_list`。如果运行成功,日志将保存在“demo / sentiment / test.log”的路径中。例如,在我们的测试中,最好的模型是`model_output / pass-00002`,分类误差是0.115645,如下: - -``` -Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645 -``` - -## 预测 - -`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下: - -``` -cd demo/sentiment -./predict.sh -``` -predict.sh: - -``` -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config\ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 -``` - -* `cat ./data/aclImdb/test/pos/10007_10.txt` : 输入预测样本。 -* `predict.py` : 预测接口脚本。 -* `--tconf=$config` : 设置网络配置。 -* `--model=$model` : 设置模型路径。 -* `--label=$label` : 设置标签类别字典,这个字典是整数标签和字符串标签的一个对应。 -* `--dict=data/pre-imdb/dict.txt` : 设置字典文件。 -* `--batch_size=1` : 设置batch size。 - -注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。 - -本示例的预测结果: - -``` -Loading parameters from model_output/pass-00002/ -./data/aclImdb/test/pos/10014_7.txt: predicting label is pos -``` -我们真诚地感谢您的关注,并欢迎您来参与贡献。 - -## 参考文档 -[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010.
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.
-[3] Alex Graves, Marcus Liwicki, Santiago Fernández, Roman Bertolami, Horst Bunke, and Jürgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019.
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015.
diff --git a/doc/tutorials/sentiment_analysis/index_en.md b/doc/tutorials/sentiment_analysis/index_en.md deleted file mode 100644 index bb7681db44ca6f286ad6935ddfecb9becb429192..0000000000000000000000000000000000000000 --- a/doc/tutorials/sentiment_analysis/index_en.md +++ /dev/null @@ -1,328 +0,0 @@ -# Sentiment Analysis Tutorial - -Sentiment analysis has many applications. A basic task in sentiment analysis is classifying the polarity of a given text at the document, sentence or feature/aspect level. One simple example is to classify the customer reviews in a shopping website, a tourism website, and group buying websites like Amazon, TaoBao, Tmall etc. - -Sentiment analysis is also used to monitor social media based on large amount of reviews or blogs. For example, the researchers analyzed several surveys on consumer confidence and political opinion, found they correlate to sentiment word frequencies in contemporaneous Twitter messages [1]. Another example is to forecast stock movements through analyzing the text content of a daily Twitter blog [2]. - -On the other hand, grabbing the user comments of products and analyzing their sentiment are useful to understand user preferences for companies, products, even competing products. - -This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy. - -## Data Preparation - -### IMDB Data Introduction - -Before training models, we need to preprocess the data and build a dictionary. First, you can use following script to download IMDB dataset and [Moses](http://www.statmt.org/moses/) tool, which is a statistical machine translation system. We provide a data preprocessing script, which is capable of handling not only IMDB data, but also other user-defined data. In order to use the pre-written script, it needs to move labeled train and test samples to another path, which has been done in `get_imdb.sh`. - -``` -cd demo/sentiment/data -./get_imdb.sh -``` -If the data is obtained successfuly, you will see the following files at ```./demo/sentiment/data```: - -``` -aclImdb get_imdb.sh imdb mosesdecoder-master -``` - -* aclImdb: raw dataset downloaded from website. -* imdb: only contains train and test data. -* mosesdecoder-master: Moses tool. - -IMDB dataset contains 25,000 highly polar movie reviews for training, and 25,000 for testing. A negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. After running `./get_imdb.sh`, we can find the dataset has the following structure in `aclImdb`. - -``` -imdbEr.txt imdb.vocab README test train -``` -* train: train sets. -* test : test sets. -* imdb.vocab: dictionary. -* imdbEr.txt: expected rating for each token in imdb.vocab. -* README: data documentation. - -The file in train set directory is as follows. The test set also contains them except `unsup` and `urls_unsup.txt`. - -``` -labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt -``` - -* pos: positive samples, contains 12,500 txt files, each file is one movie review. -* neg: negative samples, contains 12,500 txt files, each file is one movie review. -* unsup: unlabeled samples, contains 50,000 txt files. 
-* urls_xx.txt: URLs of each review.
-* xxBow.feat: already-tokenized bag of words (BoW) features.
-
-### IMDB Data Preparation
-
-In this demo, we only use the labeled train and test sets, and we do not use imdb.vocab as the dictionary. By default, the dictionary is built on the train set. The train set is shuffled while the test set is not. `tokenizer.perl` in the Moses tool is used to tokenize the words and punctuation. Simply execute the following command to preprocess the data.
-
-```
-cd demo/sentiment/
-./preprocess.sh
-```
-preprocess.sh:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i $data_dir
-```
-
-* data_dir: input data directory.
-* preprocess.py: preprocess script.
-
-If it runs successfully, you will see the `demo/sentiment/data/pre-imdb` directory as follows:
-
-```
-dict.txt labels.list test.list test_part_000 train.list train_part_000
-```
-* test\_part\_000 and train\_part\_000: all labeled test and train sets. The train set has been shuffled.
-* train.list and test.list: train and test file lists.
-* dict.txt: dictionary generated on the train set by default.
-* labels.list: neg 0, pos 1; label 0 means a negative review and label 1 a positive review.
-
-### User-defined Data Preparation
-
-If you perform another sentiment classification task, you can prepare the data as follows. We have provided the scripts to build the dictionary and preprocess the data, so you only need to organize the data like this:
-
-```
-dataset
-|----train
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-|----test
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-```
-* dataset: 1st directory.
-* train, test: 2nd directory.
-* class1,class2,...: 3rd directory.
-* text_files: samples in text file format.
-
-All text files under the same folder belong to the same category. Each text file contains one or more samples, and each line is one sample. In order to shuffle the data fully, the preprocessing differs slightly for data with multiple lines in one text file: you need to set `-m True` in `preprocess.sh`. tokenizer.perl is used by default; if you do not need it, just set `-t False` in `preprocess.sh`.
-
-## Training
-
-In this task, we use a Recurrent Neural Network (RNN) with the LSTM architecture to train the sentiment analysis model. The LSTM model was introduced primarily to overcome the problem of vanishing gradients. An LSTM network resembles a standard recurrent neural network with a hidden layer, but each ordinary node in the hidden layer is replaced by a memory cell. Each memory cell contains four main elements: an input gate, a neuron with a self-recurrent connection, a forget gate and an output gate. More details can be found in the literature [4]. The biggest advantage of the LSTM architecture is that it learns to memorize information over long time intervals without losing short-term memory. At each time step, when a new word arrives, the historical information stored in the memory block is updated to iteratively learn the sequence representation.
-
![LSTM](./lstm.png)
-
Figure 1. LSTM [3]
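To make the memory-cell description above concrete, here is a minimal NumPy sketch of a single LSTM time step with the four elements just mentioned (input gate, self-recurrent cell, forget gate, output gate). It only illustrates the gating equations and is not PaddlePaddle's implementation; the parameter containers `W`, `U` and `b` are hypothetical.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h_prev, c_prev, W, U, b):
    """One LSTM time step; W, U and b map each gate name to its parameters."""
    i = sigmoid(W['i'] @ x + U['i'] @ h_prev + b['i'])  # input gate
    f = sigmoid(W['f'] @ x + U['f'] @ h_prev + b['f'])  # forget gate
    o = sigmoid(W['o'] @ x + U['o'] @ h_prev + b['o'])  # output gate
    g = np.tanh(W['g'] @ x + U['g'] @ h_prev + b['g'])  # candidate update
    c = f * c_prev + i * g   # the memory cell keeps information over long intervals
    h = o * np.tanh(c)       # hidden state emitted at this time step
    return h, c
```

Applying this step to each word of a review in turn, and reading out the hidden states, yields the sequence representation that the classifier on top consumes.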
-
-Sentiment analysis is among the most typical problems in natural language understanding. It aims at predicting the attitude expressed in a sequence. Usually, only a few key words, like adjectives and adverbs, play a major role in predicting the sentiment of a sequence or paragraph. However, some review or comment contexts are very long, such as those in the IMDB dataset. We use LSTM to perform this task because of its improved design with the gate mechanism. First, it is able to summarize the representation from the word level to the context level with a variable context length that is adapted by the gate values. Second, it can utilize the expanded context at the sentence level, while most methods only make use of n-gram level knowledge. Third, it learns the paragraph representation directly rather than by combining context-level information, which results in an end-to-end framework.
-
-In this demo we provide two networks, namely a bidirectional LSTM and a three-layer stacked LSTM.
-
-#### Bidirectional-LSTM
-
-The first is a bidirectional LSTM network followed by a fully connected layer and a softmax layer, as shown in Figure 2.
-
![BiLSTM](./bi_lstm.jpg)
-
Figure 2. Bidirectional-LSTM
-
-#### Stacked-LSTM
-The other is the three-layer stacked LSTM structure shown in Figure 3. The bottom of the figure is the word embedding. On top of it, three LSTM hidden layers are stacked, with the second LSTM running in reverse. The maximum hidden vector over all time steps is then extracted from the hidden and LSTM layers as the representation of the entire sequence. Finally, a fully connected feed-forward layer with softmax activation performs the classification task. This network is described in paper [5].
-
![StackedLSTM](./stacked_lstm.jpg)
-
Figure 3. Stacked-LSTM for sentiment analysis
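As a rough, framework-agnostic sketch of the structure in Figure 3 (reusing the `lstm_step` helper sketched after Figure 1; all parameter names are hypothetical, and the real network is the `stacked_lstm_net` defined in `sentiment_net.py`), the three LSTM layers, the reversed second layer, the max-pooling over time and the final softmax could be composed as below, here simplified to pool only the last LSTM layer's outputs.

```python
import numpy as np

def run_lstm(xs, params, reverse=False):
    """Run one LSTM layer over a sequence and return one hidden state per step."""
    h = np.zeros(params['hidden_size'])
    c = np.zeros(params['hidden_size'])
    seq = xs[::-1] if reverse else xs
    outputs = []
    for x in seq:
        h, c = lstm_step(x, h, c, params['W'], params['U'], params['b'])
        outputs.append(h)
    return outputs[::-1] if reverse else outputs

def stacked_lstm_classify(word_embs, layers, W_fc, b_fc):
    """Three stacked LSTM layers (second one reversed), max-pool over time, softmax."""
    h1 = run_lstm(word_embs, layers[0])            # first layer, left to right
    h2 = run_lstm(h1, layers[1], reverse=True)     # second layer, right to left
    h3 = run_lstm(h2, layers[2])                   # third layer, left to right
    pooled = np.max(np.stack(h3), axis=0)          # max over all time steps
    logits = W_fc @ pooled + b_fc                  # fully connected layer
    probs = np.exp(logits - logits.max())
    return probs / probs.sum()                     # softmax over the two classes
```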
- -**Config** - -Switch into `demo/sentiment` directory, `trainer_config.py` file is an example of the config, containing algorithm and newtork configure. The first line imports predefined networks from `sentiment_net.py`. - -trainer_config.py: - -```python -from sentiment_net import * - -data_dir = "./data/pre-imdb" -# whether this config is used for test -is_test = get_config_arg('is_test', bool, False) -# whether this config is used for prediction -is_predict = get_config_arg('is_predict', bool, False) -dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) - -################## Algorithm Config ##################### - -settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - average_window=0.5, - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) - -#################### Network Config ###################### -stacked_lstm_net(dict_dim, class_dim=class_dim, - stacked_num=3, is_predict=is_predict) -#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) -``` - -* **Data Definition**: - * get\_config\_arg(): get arguments setted by `--config_args=xx` in commandline argument. - * Define data provider, here using Python interface to load data. For details, you can refer to the document of PyDataProvider2. - -* **Algorithm Configuration**: - * set batch size of 128. - * set global learning rate. - * use adam optimization. - * set average sgd window. - * set L2 regularization. - * set gradient clipping threshold. -* **Network Configuration**: - * dict_dim: dictionary dimension. - * class_dim: category number, IMDB has two label, namely positive and negative label. - * `stacked_lstm_net`: predefined network as shown in Figure 3, use this network by default. - * `bidirectional_lstm_net`: predefined network as shown in Figure 2. - -**Training** - -Install PaddlePaddle first if necessary. Then you can use script `train.sh` as follows to launch local training. - -``` -cd demo/sentiment/ -./train.sh -``` - -train.sh: - -``` -config=trainer_config.py -output=./model_output -paddle train --config=$config \ - --save_dir=$output \ - --job=train \ - --use_gpu=false \ - --trainer_count=4 \ - --num_passes=10 \ - --log_period=20 \ - --dot_period=20 \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -``` - -* \--config=$config: set network config. -* \--save\_dir=$output: set output path to save models. -* \--job=train: set job mode to train. -* \--use\_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train. -* \--trainer\_count=4: set thread number (or GPU count). -* \--num\_passes=15: set pass number, one pass in PaddlePaddle means training all samples in dataset one time. -* \--log\_period=20: print log every 20 batches. -* \--show\_parameter\_stats\_period=100: show parameter statistic every 100 batches. -* \--test\_all_data\_in\_one\_period=1: test all data every testing. - -If the run succeeds, the output log is saved in path of `demo/sentiment/train.log` and model is saved in path of `demo/sentiment/model_output/`. The output log is explained as follows. - -``` -Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875 CurrentEval: classification_error_evaluator=0.36875 -... 
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922 -Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406 -``` -- Batch=xx: means passing xx batches. -- samples=xx: means passing xx samples. -- AvgCost=xx: averaged cost from 0-th batch to current batch. -- CurrentCost=xx: current cost of latest log_period batches. -- Eval: classification\_error\_evaluator=xx: means classfication error from 0-th batch ro current batch. -- CurrentEval: classification\_error\_evaluator: current classfication error of the lates log_period batches. -- Pass=0: Going through all training set one time is called one pass. 0 means going through training set first time. - -By default, we use the `stacked_lstm_net` network, which converges at a faster rate than `bidirectional_lstm_net` when passing same sample number. If you want to use bidirectional LSTM, just remove comment in the last line and comment `stacked_lstm_net`. - -## Testing - -Testing means evaluating the labeled validation set using trained model. - -``` -cd demo/sentiment -./test.sh -``` - -test.sh: - -```bash -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' -``` - -The function `get_best_pass` gets the best model by classification error rate for testing. In this example, We use test dataset of IMDB as validation by default. Unlike training, it needs to specify `--job=test` and model path, namely `--model_list=$model_list` here. If running successfully, the log is saved in path of `demo/sentiment/test.log`. For example, in our test, the best model is `model_output/pass-00002`, the classification error is 0.115645 as follows. - -``` -Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645 -``` - -## Prediction - -`predict.py` provides a predicting interface. You should install python api of PaddlePaddle before using it. One example to predict unlabeled review of IMDB is as follows. Simply running: - -``` -cd demo/sentiment -./predict.sh -``` -predict.sh: - -``` -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config\ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 -``` - -* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample. -* `predict.py` : predicting interface. -* `--tconf=$config` : set network configure. -* ` --model=$model` : set model path. -* `--label=$label` : set dictionary about corresponding relation between integer label and string label. -* `--dict=data/pre-imdb/dict.txt` : set dictionary. -* `--batch_size=1` : set batch size. - -Note you should make sure the default model path `model_output/pass-00002` -exists or change the model path. 
- -Predicting result of this example: - -``` -Loading parameters from model_output/pass-00002/ -./data/aclImdb/test/pos/10014_7.txt: predicting label is pos -``` -We sincerely appreciate your interest and welcome your contributions. - -## Reference -[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010.
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.
-[3] Alex Graves, Marcus Liwicki, Santiago Fernández, Roman Bertolami, Horst Bunke, and Jürgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019.
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015.
diff --git a/doc/tutorials/sentiment_analysis/lstm.png b/doc/tutorials/sentiment_analysis/lstm.png deleted file mode 100644 index aaf1fc690da2ffb8418cde5ed81848ddb5263030..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/lstm.png and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg deleted file mode 100644 index adec1606d64d6e35ffe7e62abfa9a09309b05c84..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/src/lstm.png b/doc/tutorials/sentiment_analysis/src/lstm.png deleted file mode 100644 index aaf1fc690da2ffb8418cde5ed81848ddb5263030..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/src/lstm.png and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg deleted file mode 100644 index 4239055050966e0095e188a8c81d860711bce29d..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg deleted file mode 100644 index 4239055050966e0095e188a8c81d860711bce29d..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg and /dev/null differ diff --git a/doc/tutorials/text_generation/index_cn.md b/doc/tutorials/text_generation/index_cn.md deleted file mode 100644 index 41a87b926db399d692d677e5278e7d5a0b7b5594..0000000000000000000000000000000000000000 --- a/doc/tutorials/text_generation/index_cn.md +++ /dev/null @@ -1,339 +0,0 @@ -# 文本生成教程 # - -在语言生成领域中,“序列到序列”(sequence to sequence)的方法已被证明是一种强大的模型。它可以被应用于进行机器翻译(machine translation)、query改写(query rewriting)、图像描述(image captioning)等等。 - -本篇教程将会指导你通过训练一个“序列到序列”的神经网络机器翻译(NMT)模型来将法语翻译成英语。 - -我们遵循 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) 这篇文章,其中详细说明了模型架构,以及在WMT-14数据集上得到良好表现的训练过程。本篇教程在PaddlePaddle中重现了这一良好的训练结果。 - -我们感谢@caoying的pull request,其中定义了模型架构和solver配置。 - -## 数据准备 ## -### 下载与解压缩 ### -从该链接 [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/) 下载WMT-14数据集,然后解压,并将Develop和Test数据分别放入不同的文件夹。 - -- **Train data**: [bitexts (选择过后的)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) -- **Develop and Test data**: [dev 与 test 数据](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) - -在Linux下,只需要简单地运行以下命令。否则你需要自己下载、解压、拆分到不同文件夹、并且分别重命名文件后缀。 - -```bash -cd demo/seqToseq/data -./wmt14_data.sh -``` - -我们会发现数据集 `wmt14` 中包含如下表所示的3个文件夹。 - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| folder name | French-English parallel corpora file | number of total file | size  |
| ----------- | ------------------------------------ | -------------------- | ----- |
| train_data  | ccb2_pc30.src, ccb2_pc30.trg, etc    | 12                   | 3.55G |
| test_data   | ntst1213.src, ntst1213.trg           | 2                    | 1636k |
| gen_data    | ntst14.src, ntst14.trg               | 2                    | 864k  |
-
- -- 每个文件夹都包含法语到英语的平行语料库 -- **XXX.src** 是原始法语文件;**XXX.trg** 是目标英语文件 -- **XXX.src** 和 **XXX.trg** 的行数应该一致 -- 每行都是一个法语或者英语的句子 -- **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都有着一一对应的关系 - -### 用户自定义数据集 ### - -如果你想进行诸如语义转述(Paraphrasing)等其他“序列到序列”的任务,你只需要按照如下方式组织数据,并将它们放在`demo/seqToseq/data`目录下: - - dataset - train - file1.src file1.trg - file2.src file2.trg - ...... - test - file1.src file1.trg - file2.src file2.trg - ...... - gen - file1.src file1.trg - file2.src file2.trg - ...... - -- 一级目录:数据集文件夹名称 -- 二级目录:train、test和gen这三个文件夹是固定的 -- 三级目录:源语言到目标语言的平行语料库文件 - - **XXX.src** 是源语言的文件,**XXX.trg** 时目标语言的文件 - - 文件中的每行都必须是一个句子 - - **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都必须有着一一对应的关系 - -## 数据预处理 ## -### 预处理工作流程 ### -- 将每个源语言到目标语言的平行语料库文件合并为一个文件: - - 合并每个 **XXX.src** 和 **XXX.trg** 文件为 **XXX** - - **XXX** 中的第i行 = **XXX.src** 中的第i行 + '\t' + **XXX.trg**中的第i行 -- 创建训练数据的“源字典”和“目标字典”,每个字典都有DICTSIZE个单词,包括: - - 词频最高的(DICTSIZE - 3)个单词 - - 3个特殊符号 - - ``:序列的开始 - - ``:序列的结束 - - ``:未包含在字典中的单词 - -### 预处理命令和结果 -对数据集进行预处理的基本命令是: - -```python -cd demo/seqToseq/ -python preprocess.py -i INPUT [-d DICTSIZE] [-m] -``` - -- `-i INPUT`:输入的原始数据集路径 -- `-d DICTSIZE`:指定的字典单词数,如果没有设置,字典会包含输入数据集中的所有单词 -- `-m --mergeDict`:合并 “源字典”和“目标字典”,使得两个字典有相同的上下文 - -你将会看到如下消息: - - concat parallel corpora for dataset - build source dictionary for train data - build target dictionary for train data - dictionary size is XXX - -然后你只需要运行以下命令: - -```python -python preprocess.py -i data/wmt14 -d 30000 -``` - -这将花费数分钟的时间,并且将预处理好的数据集存放在`demo/seqToseq/data/pre-wmt14`目录下。目录结构如下: - - train test gen train.list test.list gen.list src.dict trg.dict# Text generation Tutorial # - -- **train, test, gen**:分别包含了法语到英语的平行语料库的训练数据、测试数据和生成数据。文件夹中的每个文件的每一行包含两部分,首先是法语序列,然后是对应的英语序列。 -- **train.list, test.list, gen.list**:分别为train,test,gen文件夹中的文件列表 -- **src.dict, trg.dict**:源(法语)/目标(英语)字典,每个字典包含总共30000个单词:29997个最高频单词和3个特殊符号 - -## 模型训练 ## -### 简介### - -神经网络机器翻译(NMT)旨在建立一个可以被协同调至最优翻译效果的单神经元网络。近期提出的NMT模型通常都属于编解码模型(encoder–decoder models)的一种。编解码模型将一个源语句编码为一个定长的向量,然后解码器通过这个向量生成一个目标语句。 - -在这个任务中,我们使用了一个编解码模型的扩展,它同时学习排列(align)与翻译。每当模型在翻译过程中生成了一个单词,它就会在源语句中搜索出最相关信息的位置的集合。解码器根据上下文向量预测出一个目标单词,这个向量与源中搜索出的位置和所有之前生成的目标单词有关。如想了解更多详细的解释,可以参考 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473)。 - -这个模型对于编解码模型来说,最不同的特色是它并没有将输入语句编码为一个单独的定长向量。相反,它将输入语句编码为向量的序列,其中每个向量对应输入语句中的一个元素。然后在解码被翻译的语句时,会自适应地从这些向量中选择一个子集出来。这使得NMT模型得以解放出来,不必再将任意长度源语句中的所有信息压缩至一个定长的向量中。该模型在长语句翻译的场景下效果提升更加明显,在任意长度语句翻译的场景下都可以观察到其效果的提升。 -
![](./encoder-decoder-attention-model.png)
-
Figure 1. Encoder-Decoder-Attention-Model
- -### 使用PaddlePaddle训练模型 ### -我们在训练之前需要常见一个模型配置文件,这里是一个例子`demo/seqToseq/translation/train.conf`。前三行import了定义network,job_mode和attention_mode的python函数。 - -```python -from seqToseq_net import * -is_generating = False - -### Data Definiation -train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating) -``` - -1. **Data Definiation**:在示例中我们定义了一个序列到序列的训练和测试数据。它返回train_conf作为配置,其输入参数如下: - - data_dir:训练数据和测试数据的目录 - - is_generating:这个配置是否用来生成,这里设置为False -2. **Algorithm Configuration**:在示例中我们使用SGD训练算法(默认),和ADAM学习方法,指定batch_size为50,learning_rate为5e-4 -3. **Network Architecture**:在示例中我们使用attention版本的GRU编解码网络。它包括了一个双向的GRU作为编码器和解码器,它模拟了解码翻译过程中在源语句中的搜索。 - -### 训练模型的命令与结果### -写完模型配置之后,我们可以通过以下命令来训练模型: - -```bash -cd demo/seqToseq/translation -./train.sh -``` - -`train.sh` 的内容如下所示: - -```bash -paddle train \ ---config='translation/train.conf' \ ---save_dir='translation/model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'translation/train.log' -``` -- config: 设置神经网络的配置文件 -- save_dir: 设置保存模型的输出路径 -- use_gpu: 是否使用GPU训练,这里设置为使用CPU -- num_passes: 设置passes的数量。paddle中的一条pass表示训练数据集中所有的样本一次 -- show_parameter_stats_period: 这里每隔100个batch显示一次参数统计信息 -- trainer_count: 设置CPU线程数或者GPU设备数 -- log_period: 这里每隔10个batch打印一次日志 -- dot_period: 这里每个5个batch打印一个点"." - -训练的损失函数默认每隔10个batch打印一次,你将会看到如下消息: - - I0719 19:16:45.952062 15563 TrainerInternal.cpp:160] Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155 CurrentEval: classification_error_evaluator=0.737155 - I0719 19:17:56.707319 15563 TrainerInternal.cpp:160] Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392 CurrentEval: classification_error_evaluator=0.659065 - ..... 
-- AvgCost:从第0个batch到当前batch的平均cost -- CurrentCost::当前batch的cost -- classification\_error\_evaluator(Eval):从第0个评估到当前评估中,每个单词的预测错误率 -- classification\_error\_evaluator(CurrentEval):当前评估中,每个单词的预测错误率 - -当classification\_error\_evaluator的值低于0.35时,模型就训练成功了。 - -## 文本生成 ## -### 简介### - -一般而言,NMT模型受制于源语句的编码,并且通过给出当前目标单词来预测下一个目标单词。在训练过程中,当前单词在相比之下总是被当作真值(ground truth)。在生成过程中,当前单词是解码器最后一步的输出,这来自于PaddlePaddle的内存中。 - -而且,我们使用集束搜索(Beam Search)来生成序列。集束搜索使用广度优先搜索来构建搜索树。对于树的每一层,生成当前层的所有后继状态,并将它们按照启发代价(heuristic cost)升序排列。但是这种方法在每层只保存预设数量的最优状态(这个数量称为beam size)。 - -### 预训练的模型 ### -我们在拥有50个节点的集群中训练模型,每个节点有两个6核CPU。我们在5天里训练了16个pass,其中每条pass花费了7个小时。model_dir中有16个子目录,每个里面都包含202MB的全部的模型参数。然后我们发现pass-00012的模型有着最高的BLEU值27.77(参考文献[BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf))。要下载解压这个模型,只需在linux下运行如下命令: - -```bash -cd demo/seqToseq/data -./wmt14_model.sh -``` - -### 使用PaddlePaddle生成模型 ### -在翻译法语句子之前,我们需要创建模型配置文件。这里是一个例子`demo/seqToseq/translation/gen.conf`。前三行import了定义network,job_mode和attention_mode的python函数。 - -```python -from seqToseq_net import * -is_generating = True - -################## Data Definiation ##################### -gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating, - gen_result = "./translation/gen_result") - -############## Algorithm Configuration ################## -settings( - learning_method = AdamOptimizer(), - batch_size = 1, - learning_rate = 0) - -################# Network configure ##################### -gru_encoder_decoder(gen_conf, is_generating) -``` - -1. **Data Definiation**:在示例中我们定义了一个序列到序列的生成数据。它返回gen_conf作为配置,其输入参数如下: - - data_dir:生成数据的目录 -  - is_generating:这个配置是否用来生成,这里设置为True -  - gen_result:保存生成结果的文件 -2. **Algorithm Configuration**:在生成过程中我们使用SGD训练算法,并指定batch_size为1(每次生成1个序列),learning_rate为0 -3. **Network Architecture**:本质上与训练模型一样 - -### 生成模型的命令与结果 ### -写完模型配置之后,我们可以通过以下命令来进行从法语到英语的文本翻译: - -```bash -cd demo/seqToseq/translation -./gen.sh -``` - - `gen.sh` 的内容如下所示。与训练模型不同的是,这里有一些不同的参数需要指定: - -```bash -paddle train \ ---job=test \ ---config='translation/gen.conf' \ ---save_dir='data/wmt14_model' \ ---use_gpu=true \ ---num_passes=13 \ ---test_pass=12 \ ---trainer_count=1 \ -2>&1 | tee 'translation/gen.log' -``` -- job:设置任务的模式为测试 -- save_dir:存储模型的路径 -- num_passes and test_pass:从test_pass到(num_passes - 1)加载模型参数,这里只加载 `data/wmt14_model/pass-00012` - -你将会看到这样的消息: - - I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012 - I0706 14:48:40.012039 31441 Tester.cpp:125] Batch=100 samples=100 AvgCost=0 - I0706 14:48:48.898632 31441 Tester.cpp:125] Batch=200 samples=200 AvgCost=0 - ... - -然后在`demo/seqToseq/translation/gen_result`中的生成结果如下所示: - - 0 - 0 -11.1314 The about the width of the seats while large controls are at stake - 1 -11.1519 The on the width of the seats while large controls are at stake - 2 -11.5988 The about the width of the seats while large controls are at stake . - - 1 - 0 -24.4149 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of the Dubai . - 1 -26.9524 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s . 
- 2 -27.9574 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s Dubai . - ... - -- 这是集束搜索的结果,其中beam size是3 -- 第一行的“0”和第6行的“1”表示生成数据的序列id -- 其他六行列出了集束搜索的结果 - - 第二列是集束搜索的得分(从大到小) - - 第三列是生成的英语序列 -- 有两个特殊标识: - - ``:序列的结尾 - - ``:不包含在字典中的单词 - -### BLEU评估 ### -对机器翻译的人工评估工作很广泛但也很昂贵。一篇论文 [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) 展示了一种方法,当需要快速或者频繁的评估时,使用自动的替补来替代经验丰富的人工评判。[Moses](http://www.statmt.org/moses/) 是一个统计学的机器翻译系统,我们使用其中的 [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) 来做BLEU评估。运行以下命令来下载这个脚本: - -```bash -cd demo/seqToseq/translation -./moses_bleu.sh -``` - -由于标准的翻译结果已经下载到这里`data/wmt14/gen/ntst14.trg`,我们可以运行以下命令来做BLEU评估。 - -```bash -cd demo/seqToseq/translation -./eval_bleu.sh FILE BEAMSIZE -``` - -- FILE:生成的结果文件 -- BEAMSIZE:集束搜索中的扩展广度 diff --git a/doc/tutorials/text_generation/index_en.md b/doc/tutorials/text_generation/index_en.md deleted file mode 100644 index 5d8e667c20bd1fda64a6e11a88517d52112b72fa..0000000000000000000000000000000000000000 --- a/doc/tutorials/text_generation/index_en.md +++ /dev/null @@ -1,338 +0,0 @@ -# Text generation Tutorial # - -Sequence to sequence has been proven to be a powerful model for language generation. It can be used for machine translation, query rewriting, image captioning, etc. - -This tutorial guides you through training a sequence to sequence model for neural machine translation (NMT) network that translates French to English. - -We follow the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) , which details the model architecture and training procedure for good performance on WMT-14 dataset. This tutorial reproduces this result in PaddlePaddle. - -We thank @caoying for the pull request that defines the model architecture and solver configurations. - -## Data Preparation ## -### Download and Extract ### -Download the WMT-14 dataset from [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), extract it, and divide Develop and Test data into separate folder. - -- **Train data**: [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) -- **Develop and Test data**: [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) - -To do this, simply run the following commands in linux, otherwise, you need to download, extract, divide, and rename the file suffix respectively. - -```bash -cd demo/seqToseq/data -./wmt14_data.sh -``` - -We should find that the dataset `wmt14` has three folders as shown in the following table. - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| folder name | French-English parallel corpora file | number of total file | size  |
| ----------- | ------------------------------------ | -------------------- | ----- |
| train_data  | ccb2_pc30.src, ccb2_pc30.trg, etc    | 12                   | 3.55G |
| test_data   | ntst1213.src, ntst1213.trg           | 2                    | 1636k |
| gen_data    | ntst14.src, ntst14.trg               | 2                    | 864k  |
-
- -- Each folder has French-English parallel corpora -- **XXX.src** are source French files; **XXX.trg** are target English files. -- The number of lines of **XXX.src** and **XXX.trg** should be the same. -- Each line is a French/English sentence. -- There is a one-to-one correspondence between the sentence at the i-th line of **XXX.src** and **XXX.trg**. - -### User Defined Dataset ### - -If you need to do other sequence-to-sequence tasks, such as Paraphrasing, you only need to organize the data as follows, and place them in `demo/seqToseq/data`: - - dataset - train - file1.src file1.trg - file2.src file2.trg - ...... - test - file1.src file1.trg - file2.src file2.trg - ...... - gen - file1.src file1.trg - file2.src file2.trg - ...... -- 1st directory: dataset folder name -- 2nd directory: folder of train, test, and gen. The names of these three folders are fixed. -- 3rd file: Source-Target parallel corpora files. - - **XXX.src** are source files, **XXX.trg** are target files. - - Each line of the file must be a sequence. - - There should be a one-to-one correspondence between the i-th sequence of **XXX.src** and **XXX.trg**. - -## Data Preprocess ## -### Preprocessing Workflow ### -- Concat each Source-Target parallel corpora to be one file: - - concat each **XXX.src** and **XXX.trg** to be **XXX**. - - the i-th line of **XXX** = the i-th line of **XXX.src** + '\t' + the i-th line of **XXX.trg** -- Build source and target dictionary of train data, each dictionary has DICTSIZE words: - - the most frequent (DICTSIZE-3) words - - 3 special token: - - ``: the start of a sequence - - ``: the end of a sequence - - ``: a word not included in dictionary - -### Preprocessing Command and Result -The general command for preprocessing the dataset is: - -```python -cd demo/seqToseq/ -python preprocess.py -i INPUT [-d DICTSIZE] [-m] -``` - -- `-i INPUT`: the path of input original dataset -- `-d DICTSIZE`: the specified word count of dictionary, if not set, dictionary will contain all the words in input dataset -- `-m --mergeDict`: merge source and target dictionary, thus, two dictionaries have the same context - -And you will see messages like this: - - concat parallel corpora for dataset - build source dictionary for train data - build target dictionary for train data - dictionary size is XXX - -Here, you can simply run the command: - -```python -python preprocess.py -i data/wmt14 -d 30000 -``` - -It will take several minutes, and store the preprocessed dataset in `demo/seqToseq/data/pre-wmt14`, the directory has following structure. - - train test gen train.list test.list gen.list src.dict trg.dict - -- **train, test, gen**: folder contains French-English parallel corpora of train data, test data and gen data respectively. Each line of file in folder contains two parts, the former is a French sequence, and the latter is a corresponding English sequence. -- **train.list, test.list, gen.list**: text contains a file list in train folder, test folder and gen folder respectively -- **src.dict, trg.dict**: source (French) / target (English) dictionary, each dictionary has 30000 words: the most frequent 29997 words and 3 special token - -## Model Training ## -### Introduction ### - -Neural machine translation (NMT) aims at building a single neural network that can be jointly tuned to maximize translation performance. Recently proposed NMT models often belong to a family of encoder–decoder models. 
Encoder-Decoder models encode a source sentence into a fixed-length vector from which a decoder generates a target sentence.
-
-In this task, we use an extension of the encoder-decoder model which learns to align and translate jointly. Each time the model generates a word in a translation, it searches for the set of positions in the source sentence that carry the most relevant information. The decoder then predicts a target word based on the context vectors associated with these source positions and all the previously generated target words. For a more detailed explanation, readers can refer to the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473).
-
-The most distinguishing feature of this model is that it does not encode an input sentence into a single fixed-length vector. Instead, it encodes the input sentence into a sequence of vectors, where each vector corresponds to an input element. A subset of these vectors is chosen adaptively while decoding the translated sentence. This frees an NMT model from having to squash all the information of a source sentence, regardless of its length, into a fixed-length vector. The improvement of this model is most apparent for longer sentences, but it can be observed for sentences of any length.
-
![](./encoder-decoder-attention-model.png)
-
Figure 1. Encoder-Decoder-Attention-Model
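The attention mechanism described above can be summarized in a few lines before we look at the training configuration: at every decoding step, an alignment score is computed between the decoder state and each encoder annotation, the scores are normalized with a softmax, and the context vector is the resulting weighted sum. This is only a schematic NumPy sketch of the idea from the cited paper, not the `gru_encoder_decoder` implementation; using a plain dot product as the scoring function is a simplifying assumption.

```python
import numpy as np

def attention_context(decoder_state, encoder_states):
    """decoder_state: (hidden,); encoder_states: (source_len, hidden)."""
    scores = encoder_states @ decoder_state   # one alignment score per source position
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                  # softmax over source positions
    context = weights @ encoder_states        # weighted sum of source annotations
    return context, weights
```

The decoder predicts the next target word from this context vector together with the previously generated words, which is what allows it to attend to different source positions for every output word.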
- -### Training Model in PaddlePaddle ### -We need to create a model config file before training. Here is an example `demo/seqToseq/translation/train.conf`. The first three lines import python function for defining network, and define the job_mode and attention_mode. - -```python -from seqToseq_net import * -is_generating = False - -### Data Definiation -train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating) -``` - -1. **Data Definiation**: We define a SeqToSeq train and test data in our example. It returns train_conf as the configuration, following is its input arguments: - - data_dir: directory of train data and test data - - is\_generating: whether this config is used for generating, here is false -2. **Algorithm Configuration**: We use the SGD training algorithm (default), ADAM learning method in our example, specify batch_size as 50, and learning rate as 5e-4. -3. **Network Architecture**: We use an attention version of GRU Encoder-Decoder network in our example. It consists a bidirectional GRU as an encoder and a decoder that emulates searching through a source sentence during decoding a translation. - -### Training Command and Result### -After writing the model config, we can train the model by running the command: - -```bash -cd demo/seqToseq/translation -./train.sh -``` - -The `train.sh` is shown as follows: - -```bash -paddle train \ ---config='translation/train.conf' \ ---save_dir='translation/model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'translation/train.log' -``` -- config: set config of neural network -- save_dir: set output path to save models -- use_gpu: whether to use GPU to train, here use CPU -- num_passes: set number of passes. One pass in paddle means training all samples in dataset one time -- show_parameter_stats_period: here show parameter statistic every 100 batches -- trainer_count: set number of CPU threads or GPU devices -- log_period: here print log every 10 batches -- dot_period: here print '.' every 5 batches - -The training loss function is printed every 10 batch by default, and you will see messages like this: - - I0719 19:16:45.952062 15563 TrainerInternal.cpp:160] Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155 CurrentEval: classification_error_evaluator=0.737155 - I0719 19:17:56.707319 15563 TrainerInternal.cpp:160] Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392 CurrentEval: classification_error_evaluator=0.659065 - ..... -- AvgCost: Average Cost from 0th batch to current batch -- CurrentCost: Cost in current batch -- classification\_error\_evaluator(Eval): False prediction rate for each word from 0th evaluation to current evaluation -- classification\_error\_evaluator(CurrentEval): False prediction rate for each word in current evaluation - -And when the classification\_error\_evaluator is less than 0.35, the model is trained sucessfully. - -## Text Generation ## -### Introduction ### - -Generally speaking, the NMT model is conditioned on the encodings of the source sentence, and then to predict the next target word by given the current target word. 
In the training process, the current word is always known and used as the ground truth. In the generating process, by contrast, the current word is the output of the decoder at the last time step, which is accessed from a memory in PaddlePaddle.
-
-Besides, we use Beam Search to generate sequences. Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level, sorting them in increasing order of heuristic cost. However, it only stores a predetermined number of best states at each level (called the beam size).
-
-### Pretrained model ###
-We trained the model on a cluster with 50 nodes, each with two 6-core CPUs. We trained 16 passes in 5 days, where each pass took 7 hours. The model_dir has 16 sub-folders, each of which contains the complete model parameters, about 202MB in size. We find that the pass-00012 model has the highest BLEU score of 27.77 (see the paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)). To download and extract this model, simply run the following commands on Linux.
-
-```bash
-cd demo/seqToseq/data
-./wmt14_model.sh
-```
-
-### Generating Model in PaddlePaddle ###
-We need to create a model config file before translating French sequences. Here is an example, `demo/seqToseq/translation/gen.conf`; the first three lines import the python function for defining the network, and define the job\_mode and attention\_mode.
-
-```python
-from seqToseq_net import *
-is_generating = True
-
-################## Data Definiation #####################
-gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                           is_generating = is_generating,
-                           gen_result = "./translation/gen_result")
-
-############## Algorithm Configuration ##################
-settings(
-      learning_method = AdamOptimizer(),
-      batch_size = 1,
-      learning_rate = 0)
-
-################# Network configure #####################
-gru_encoder_decoder(gen_conf, is_generating)
-```
-
-1. **Data Definiation**: We define SeqToSeq gen data in our example. It returns gen_conf as the configuration; its input arguments are as follows:
-   - data\_dir: directory of the gen data
-   - is\_generating: whether this config is used for generating, here it is true
-   - gen\_result: file to store the generation result
-2. **Algorithm Configuration**: We use the SGD training algorithm in generation, and specify batch_size as 1 (generate one sequence each time) and the learning rate as 0.
-3. **Network Architecture**: Essentially the same as the training model.
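Before running generation, it may help to recall how the beam search introduced at the beginning of this section behaves. The sketch below is a generic illustration only, not the decoder PaddlePaddle actually runs from `gen.sh`; the `next_word_scores` callback (returning log-probabilities for the next word given a prefix) is a hypothetical stand-in for the network.

```python
def beam_search(next_word_scores, start_token, end_token, beam_size=3, max_len=50):
    """Keep only the beam_size best partial hypotheses at every level of the search tree."""
    beams = [([start_token], 0.0)]     # (partial sequence, accumulated log-probability)
    finished = []
    for _ in range(max_len):
        candidates = []
        for seq, score in beams:
            if seq[-1] == end_token:
                finished.append((seq, score))
                continue
            for word, logp in next_word_scores(seq).items():
                candidates.append((seq + [word], score + logp))
        if not candidates:
            break
        candidates.sort(key=lambda t: t[1], reverse=True)
        beams = candidates[:beam_size]
    return sorted(finished + beams, key=lambda t: t[1], reverse=True)[:beam_size]
```

The ranked hypotheses and scores written to `gen_result`, shown later in this section, are exactly this kind of output with a beam size of 3.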
-
-### Generating Command and Result ###
-After writing the model config, we can do text translation from French to English by running the command:
-
-```bash
-cd demo/seqToseq/translation
-./gen.sh
-```
-
-The `gen.sh` is shown as follows. Unlike training, there are some different arguments to specify:
-
-```bash
-paddle train \
---job=test \
---config='translation/gen.conf' \
---save_dir='data/wmt14_model' \
---use_gpu=true \
---num_passes=13 \
---test_pass=12 \
---trainer_count=1 \
-2>&1 | tee 'translation/gen.log'
-```
-- job: set job mode to test
-- save_dir: the path of saved models
-- num_passes and test_pass: load model parameters from test_pass to (num_passes - 1); here only `data/wmt14_model/pass-00012` is loaded
-
-You will see messages like this:
-
-    I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012
-    I0706 14:48:40.012039 31441 Tester.cpp:125] Batch=100 samples=100 AvgCost=0
-    I0706 14:48:48.898632 31441 Tester.cpp:125] Batch=200 samples=200 AvgCost=0
-    ...
-
-The generation result in `demo/seqToseq/translation/gen_result` looks like:
-
-    0
-    0 -11.1314 The about the width of the seats while large controls are at stake
-    1 -11.1519 The on the width of the seats while large controls are at stake
-    2 -11.5988 The about the width of the seats while large controls are at stake .
-
-    1
-    0 -24.4149 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of the Dubai .
-    1 -26.9524 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s .
-    2 -27.9574 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s Dubai .
-    ...
-
-- This is the beam search result, where the beam size is 3.
-- The '0' in the 1st line and the '1' in the 6th line are the sequence ids in the gen data.
-- The other six lines list the beam search results:
-  - The 2nd column is the beam search score (from large to small).
-  - The 3rd column is the generated English sequence.
-- There are 2 special tokens:
-  - `<e>`: the end of a sequence
-  - `<unk>`: a word not included in the dictionary
-
-### BLEU Evaluation ###
-Human evaluations of machine translation are extensive but expensive. The paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) presents a method as an automated understudy to skilled human judges, substituting for them when there is a need for quick or frequent evaluations. [Moses](http://www.statmt.org/moses/) is a statistical machine translation system, and we use its [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) script to do the BLEU evaluation.
To download this script, simply run the following command:
-
-```bash
-cd demo/seqToseq/translation
-./moses_bleu.sh
-```
-
-Since the standard translation has already been downloaded as `data/wmt14/gen/ntst14.trg`, we can do the BLEU evaluation by running the command:
-
-```bash
-cd demo/seqToseq/translation
-./eval_bleu.sh FILE BEAMSIZE
-```
-
-- FILE: the generation result file
-- BEAMSIZE: the expansion width in beam search
diff --git a/doc_theme/templates/layout.html b/doc_theme/templates/layout.html index 65e61c5f298e19adc6330c378779a6edf418752e..9fca69dc4e7f0827acfc755a97a662350214b90e 100644 --- a/doc_theme/templates/layout.html +++ b/doc_theme/templates/layout.html @@ -101,7 +101,7 @@