diff --git a/.gitignore b/.gitignore index 2b30f7938c8a1672acd0a14b7051af12c37889fb..9622ab78e0e0556ec2b4cc974fee93ff680d54d2 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,13 @@ third_party/ *~ bazel-* third_party/ + +# clion workspace. +cmake-build-* + +# generated while compiling +python/paddle/v2/framework/core.so +CMakeFiles +cmake_install.cmake +paddle/.timestamp +python/paddlepaddle.egg-info/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b138576fcc695408c4cc0a03d227da7d0c6f440..bb8c88787d37faf9ce4d7d856a307c11f1085d98 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,8 +3,8 @@ hooks: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ -- repo: https://github.com/reyoung/mirrors-yapf.git - sha: v0.13.2 +- repo: https://github.com/PaddlePaddle/mirrors-yapf.git + sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 hooks: - id: yapf files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ @@ -17,7 +17,20 @@ - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer -- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git - sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 +- repo: local hooks: - - id: clang-formater + - id: clang-format + name: clang-format + description: Format files with ClangFormat. 
+ entry: clang-format -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ +- repo: https://github.com/PaddlePaddle/pre-commit-golang + sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 + hooks: + - id: go-fmt + types: + - go + - id: gometalinter + types: + - go diff --git a/.travis.yml b/.travis.yml index 387367a2305e7bf582e29538ab9e51571b9ae75b..b4b83fcdbc84ce0fb0c91c816ebc3c964acfa590 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,23 +1,23 @@ language: cpp cache: directories: - - $HOME/third_party - $HOME/.ccache - $HOME/.cache/pip + - $TRAVIS_BUILD_DIR/build/third_party + - $TRAVIS_BUILD_DIR/build_android/third_party sudo: required dist: trusty os: - linux env: - - JOB=DOCS - - JOB=BUILD_AND_TEST - - JOB=PRE_COMMIT + - JOB=build_doc + - JOB=check_style + - JOB=build_android addons: apt: packages: - gcc-4.8 - g++-4.8 - - gfortran-4.8 - git - build-essential - python @@ -34,26 +34,20 @@ addons: - libtool - ccache before_install: - - | - if [ ${JOB} == "BUILD_AND_TEST" ]; then - local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE` - if [ $? -eq 0 ]; then # if git diff return no zero, then rerun unit test. - if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)' - then - echo "Only markdown docs were updated, stopping build process." - exit - fi - fi - fi - - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python + - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi + # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. 
- - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker + - pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt + - pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker + - curl https://glide.sh/get | bash + - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" + - go get -u github.com/alecthomas/gometalinter + - gometalinter --install - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - - | - timeout 2580 paddle/scripts/travis/main.sh # 43min timeout + - | + timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: diff --git a/AUTHORS.md b/AUTHORS.md index d5baee2161aa1d5360056e03ca67d5b2fe9ff7d2..4db4a4a8e7441b07ce2db4adff043bb99a09014b 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,28 +1,48 @@ | Github account | name | |---|---| -| reyoung | Yang Yu | +| backyes | Yan-Fei Wang | +| beckett1124 | Bin Qi | +| Canpio | Jia-Yi Feng | +| chengxiaohua1105 | Xiao-Hua Cheng | +| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | +| cxysteven | Xing-Yi Cheng | +| dzhwinter | Zhi-Hong Dong | +| emailweixu | Wei Xu | | gangliao | Gang Liao | -| luotao01 | Tao Luo | -| jacquesqiao | Long-Fei Qiao | -| qingqing01 | Qing-Qing Dang | +| gongweibao | Wei-Bao Gong | +| Guo Sheng | Sheng Guo | +| Haichao-Zhang | Hai-Chao Zhang | | hedaoyuan | Dao-Yuan He | -| wangyang59 | Yang Wang | +| helinwang | He-Lin Wang | +| jacquesqiao | Long-Fei Qiao | +| kuke | Yi-Bing Liu | +| lcy-seso | Ying Cao | +| lipeng-unisound | Peng Li | +| liuyuan | Yuan Liu | +| livc | Zhao Li | +| llxxxll | Yong-Feng Liu | +| luotao01 | Tao Luo | +| lzhao4ever | Liang Zhao | +| NHZlX | Zhao-Long Xing | +| pakchoi | Chuan-Jiang Song | +| pengli09 | Peng Li | +| pkuyym | Ya-Ming Yang | | QiJune | Jun Qi | +| qingqing01 | Qing-Qing Dang | +| reyoung | Yang 
Yu | +| Superjom | Chun-Wei Yan | | tianbingsz | Tian-Bing Xu | -| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | typhoonzero | Yi Wu | -| backyes | Yan-Fei Wang | -| pengli09 | Peng Li | -| livc | Zhao Li | +| wanghaoshuang | Hao-Shuang Wang | +| wangyang59 | Yang Wang | +| wangzhen-nlp | Zhen Wang | +| wen-bo-yang | Wen-Bo Yang | +| wwhu | Wei-Wei Hu | +| xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | +| xujun05 | Jun Xu | +| xushaoyong | Shao-Yong Xu | | Yancey1989 | Xu Yan | -| emailweixu | Wei Xu | -| wen-bo-yang | Wen-Bo Yang | -| helinwang | He-Lin Wang | -| lcy-seso | Ying Cao | -| Zrachel | Rui-Qing Zhang | -| Haichao-Zhang | Hai-Chao Zhang | -| gongweibao | Wei-Bao Gong | -| lzhao4ever | Liang Zhao | +| zhaopu7 | Pu Zhao | | zhouxiao-coder | Xiao Zhou | -| lipeng-unisound | Peng Li | +| Zrachel | Rui-Qing Zhang | diff --git a/CMakeLists.txt b/CMakeLists.txt index 884afa962bbaff1defe610a9cd5b4a6e5d46c7c3..c75b83e50cf9cef8290c37f88b38cdc3d77df39c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,13 +13,13 @@ # limitations under the License cmake_minimum_required(VERSION 3.0) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) +set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) -project(paddle CXX C) +project(paddle CXX C Go) find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) @@ -27,12 +27,17 @@ if(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) +if(NOT ANDROID) + find_package(Boost QUIET) +endif() include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." 
OFF) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -47,6 +52,9 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) +option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) +option(GLIDE_INSTALL "Download and install go dependencies " ON) +option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -68,9 +76,13 @@ if(ANDROID) "Disable PYTHON when cross-compiling for Android" FORCE) set(WITH_RDMA OFF CACHE STRING "Disable RDMA when cross-compiling for Android" FORCE) + set(WITH_MKLDNN OFF CACHE STRING + "Disable MKLDNN when cross-compiling for Android" FORCE) + set(WITH_MKLML OFF CACHE STRING + "Disable MKLML package when cross-compiling for Android" FORCE) endif(ANDROID) -set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING +set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") if (WITH_C_API AND WITH_PYTHON) @@ -81,6 +93,7 @@ endif() ######################################################################################## +include(external/mklml) # download mklml package include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -88,10 +101,15 @@ include(external/gtest) # download, build, install gtest include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python include(external/openblas) # download, build, 
install openblas +include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any +include(external/eigen) # download eigen3 +include(external/pybind11) # download pybind11 +include(cudnn) # set cudnn libraries, must before configure +include(configure) # add paddle env configuration include(generic) # simplify cmake module include(package) # set paddle packages include(cpplint) # set paddle c++ style @@ -99,14 +117,15 @@ include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(rdma) # set rdma libraries include(flags) # set paddle compile flags -include(cudnn) # set cudnn libraries include(version) # set PADDLE_VERSION include(coveralls) # set code coverage -include(configure) # add paddle env configuration -include_directories("${PROJ_ROOT}") -include_directories("${PROJ_ROOT}/paddle/cuda/include") + +include_directories("${PADDLE_SOURCE_DIR}") +include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") +include_directories(${Boost_INCLUDE_DIRS}) set(EXTERNAL_LIBS ${GFLAGS_LIBRARIES} @@ -124,9 +143,32 @@ if(WITH_GPU) endif(NOT WITH_DSO) endif(WITH_GPU) +if(WITH_MKLDNN) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB}) +endif() + +if(USE_NNPACK) + include(external/nnpack) + list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) +endif(USE_NNPACK) + add_subdirectory(proto) + +# "add_subdirectory(go)" should be placed after the following line, +# because it depends on paddle/optimizer. +add_subdirectory(paddle/optimizer) + +# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be +# placed after this block, because they depend on it.
+if(WITH_GOLANG) + add_subdirectory(go) +endif(WITH_GOLANG) + +set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") add_subdirectory(paddle) -add_subdirectory(python) +if(WITH_PYTHON) + add_subdirectory(python) +endif() if(WITH_DOC) add_subdirectory(doc) diff --git a/Dockerfile b/Dockerfile index ad0d086d3c65b5901178aa681aa36ccc0ea0c246..41b6729124228cec16be35d9b26da8042824b0b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,14 +25,30 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ git python-pip python-dev openssh-server bison \ - wget unzip tar xz-utils bzip2 gzip coreutils \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ - python-numpy python-matplotlib gcc g++ \ - automake locales clang-format-3.8 swig doxygen cmake \ + python-matplotlib gcc-4.8 g++-4.8 \ + automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ - clang-3.8 llvm-3.8 libclang-3.8-dev && \ + clang-3.8 llvm-3.8 libclang-3.8-dev \ + net-tools && \ apt-get clean -y +# paddle is using numpy.flip, which is introduced since 1.12.0 +RUN pip --no-cache-dir install 'numpy>=1.12.0' + +# Install Go and glide +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + # git credential to skip password typing RUN git config --global credential.helper store @@ -47,13 +63,29 @@ RUN pip install --upgrade pip && \ pip install -U docopt PyYAML sphinx && \ pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \ pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python rarfile 'scipy>=0.19.0' 'nltk>=3.2.2' # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev RUN pip install certifi urllib3[secure] +# TODO(qijun) The template library Eigen doesn't work well with GCC 5 +# coming with the default Docker image, so we switch to use GCC 4.8 +# by default. And I will check Eigen library later. 
+ +RUN ln -sf gcc-4.8 /usr/bin/gcc && \ + ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \ + ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \ + ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \ + ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \ + ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \ + ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \ + ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \ + ln -sf g++-4.8 /usr/bin/g++ && \ + ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++ + # Install woboq_codebrowser to /woboq RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ (cd /woboq \ diff --git a/Dockerfile.android b/Dockerfile.android new file mode 100644 index 0000000000000000000000000000000000000000..c0fa58c384f9ebcae60477ffce49ea4ffa929db9 --- /dev/null +++ b/Dockerfile.android @@ -0,0 +1,49 @@ +FROM ubuntu:16.04 +MAINTAINER PaddlePaddle Authors + +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +ENV HOME=/root \ + ANDROID_NDK_HOME=/opt/android-ndk-linux \ + ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc + +RUN apt-get update && \ + apt-get install -y \ + git python-dev python-pip python-numpy \ + wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ + apt-get clean -y + +# Install Go and glide +RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go.tgz && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src && \ + rm go.tgz +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN pip install --upgrade pip && \ + pip install -U 'protobuf==3.1.0' && \ + pip install -U wheel sphinx && \ + pip install pre-commit + +# Android NDK +RUN mkdir /opt/android-ndk-tmp && \ + cd /opt/android-ndk-tmp && \ + wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ + unzip -q android-ndk-r14b-linux-x86_64.zip && \ + mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ + ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \ + rm -rf /opt/android-ndk-tmp && \ + rm -rf ${ANDROID_NDK_HOME} + +CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"] diff --git a/README.md b/README.md index bcc24b84128df282a2e3f0bc62aafe1ffe172338..b9793c3eab5d40c28f01cc67ad607b97261b3235 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) 
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -61,35 +61,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl ## Installation It is recommended to check out the -[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) +[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) before looking into the -[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html) +[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html) ## Documentation -We provide [English](http://www.paddlepaddle.org/develop/doc/) and -[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation. +We provide [English](http://doc.paddlepaddle.org/develop/doc/) and +[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation. -- [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) +- [Deep Learning 101](http://book.paddlepaddle.org/index.html) - You might want to start from the this online interactive book that can run in Jupyter Notebook. + You might want to start from this online interactive book that can run in Jupyter Notebook. -- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) +- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) You can run distributed training jobs on MPI clusters. 
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) +- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) You can also run distributed training jobs on Kubernetes clusters. -- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html) +- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html) Our new API enables much shorter programs. -- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) +- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) We appreciate your contributions! + ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). diff --git a/paddle/go/cclient/cmake/CMakeDetermineGoCompiler.cmake b/cmake/CMakeDetermineGoCompiler.cmake similarity index 73% rename from paddle/go/cclient/cmake/CMakeDetermineGoCompiler.cmake rename to cmake/CMakeDetermineGoCompiler.cmake index b3f8fbe271d80aaa72d90d167a0d8130bec7f362..abf0a00c5e99e4201dede36f13200cfc9c151ad3 100644 --- a/paddle/go/cclient/cmake/CMakeDetermineGoCompiler.cmake +++ b/cmake/CMakeDetermineGoCompiler.cmake @@ -15,7 +15,7 @@ if(NOT CMAKE_Go_COMPILER) set(Go_BIN_PATH $ENV{GOPATH} $ENV{GOROOT} - $ENV{GOROOT}/../bin + $ENV{GOROOT}/bin $ENV{GO_COMPILER} /usr/bin /usr/local/bin @@ -28,10 +28,12 @@ if(NOT CMAKE_Go_COMPILER) NAMES go PATHS ${Go_BIN_PATH} ) - EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION) - STRING(REGEX MATCH "go[0-9]+.[0-9]+.[0-9]+[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}") - message("-- The Golang compiler identification is ${VERSION}") - message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}") + if(CMAKE_Go_COMPILER) + EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION) + STRING(REGEX MATCH "go[0-9]+[.0-9]*[ 
/A-Za-z0-9]*" VERSION "${GOLANG_VERSION}") + message("-- The Golang compiler identification is ${VERSION}") + message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}") + endif() endif() endif() diff --git a/paddle/go/cclient/cmake/CMakeGoCompiler.cmake.in b/cmake/CMakeGoCompiler.cmake.in similarity index 100% rename from paddle/go/cclient/cmake/CMakeGoCompiler.cmake.in rename to cmake/CMakeGoCompiler.cmake.in diff --git a/paddle/go/cclient/cmake/CMakeGoInformation.cmake b/cmake/CMakeGoInformation.cmake similarity index 100% rename from paddle/go/cclient/cmake/CMakeGoInformation.cmake rename to cmake/CMakeGoInformation.cmake diff --git a/paddle/go/cclient/cmake/CMakeTestGoCompiler.cmake b/cmake/CMakeTestGoCompiler.cmake similarity index 100% rename from paddle/go/cclient/cmake/CMakeTestGoCompiler.cmake rename to cmake/CMakeTestGoCompiler.cmake diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 913f711afff3b8f9f77b8da978a3b9e7165d0077..854066fd1d205c337fbdbe08997d88251095c799 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -15,23 +15,44 @@ set(CBLAS_FOUND OFF) -## Find MKL First. -set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") -set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") +## Find MKLML First. +if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER MKLML) + set(CBLAS_INC_DIR ${MKLML_INC_DIR}) + set(CBLAS_LIBRARIES ${MKLML_LIB}) + + add_definitions(-DPADDLE_USE_MKLML) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found cblas and lapack in MKLML " + "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + return() +endif() + +## Then find MKL. 
+set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs") +set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL") + +set(MKL_INCLUDE_SEARCH_PATHS + ${MKL_ROOT}/include + ${INTEL_MKL_ROOT}/include) +set(MKL_LIB_SEARCH_PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64 + ${INTEL_MKL_ROOT}/lib + ${INTEL_MKL_ROOT}/lib/intel64) find_path(MKL_INC_DIR mkl.h PATHS - ${MKL_ROOT}/include) + ${MKL_INCLUDE_SEARCH_PATHS}) find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS - ${MKL_ROOT}/include) + ${MKL_INCLUDE_SEARCH_PATHS}) find_library(MKL_CORE_LIB NAMES mkl_core PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) set(CBLAS_FOUND ON) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5e507e78f74eee885922f502f35e3c15fafb622d..209f9078a637ac581d90212a48216eb388c477ed 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -40,6 +40,10 @@ if(NOT CMAKE_CROSSCOMPILING) endif() endif() +if(NOT WITH_GOLANG) + add_definitions(-DPADDLE_WITHOUT_GOLANG) +endif(NOT WITH_GOLANG) + if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) @@ -63,5 +67,76 @@ else() include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) +if(WITH_MKLDNN) + add_definitions(-DPADDLE_USE_MKLDNN) + if (WITH_MKLML AND MKLDNN_IOMP_DIR) + message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") + set(OPENMP_FLAGS "-fopenmp") + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") + else() + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + else() + message(WARNING "Can not find OpenMP." + "Some performance features in MKLDNN may not be available") + endif() + endif() + +endif(WITH_MKLDNN) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") + +if(WITH_GOLANG) + # we need to symlink Paddle directory into GOPATH. If we + # don't do it and we have code that depends on Paddle, go + # get ./... will download a new Paddle repo from Github, + # without the changes in our current Paddle repo that we + # want to build. 
+ set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go") + file(MAKE_DIRECTORY ${GOPATH}) + set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle") + file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}") + set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go") + + add_custom_target(go_path) + add_custom_command(TARGET go_path + # Symlink Paddle directory into GOPATH + COMMAND mkdir -p ${PADDLE_IN_GOPATH} + COMMAND rm -rf ${PADDLE_IN_GOPATH} + COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH} + # Automatically get all dependencies specified in the source code + # We can't run `go get -d ./...` for every target, because + # multiple `go get` can not run concurrently, but make needs to be + # able to run with multiple jobs. + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + + if (GLIDE_INSTALL) + if(EXISTS $ENV{GOPATH}/bin/glide) + set(GLIDE "$ENV{GOPATH}/bin/glide") + else() + message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide") + endif() + + # this command will only run when the file it depends on is missing + # or has changed, or the output is missing. + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide + COMMAND env GOPATH=${GOPATH} ${GLIDE} install + COMMAND touch ${CMAKE_BINARY_DIR}/glide + DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" + ) + + # depends on the custom command which outputs + # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to + # run every time this target is built.
+ add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path) + endif() + +endif(WITH_GOLANG) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 02a5c0b2c9be782c459a255c6ffd6ba6441f2693..8d5d533126c9b7fa84c725d614cf3486126d0284 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -25,8 +25,10 @@ set(STYLE_FILTER "${STYLE_FILTER}-readability/casting") set(IGNORE_PATTERN .*ImportanceSampler.* .*cblas\\.h.* + .*\\.pb\\.txt .*LtrDataProvider.* - .*MultiDataProvider.*) + .*MultiDataProvider.* + .*pb.*) # add_style_check_target # @@ -40,27 +42,21 @@ macro(add_style_check_target TARGET_NAME) if(WITH_STYLE_CHECK) set(SOURCES_LIST ${ARGN}) list(REMOVE_DUPLICATES SOURCES_LIST) - list(SORT SOURCES_LIST) - foreach(filename ${SOURCES_LIST}) - set(LINT ON) foreach(pattern ${IGNORE_PATTERN}) if(filename MATCHES ${pattern}) - message(STATUS "DROP LINT ${filename}") - set(LINT OFF) + list(REMOVE_ITEM SOURCES_LIST ${filename}) endif() endforeach() - if(LINT MATCHES ON) - get_filename_component(base_filename ${filename} NAME) - set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) - add_custom_command(OUTPUT ${CUR_GEN} - PRE_BUILD - COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" - "--write-success=${CUR_GEN}" ${filename} - DEPENDS ${filename} - WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) - endif() endforeach() + + if(SOURCES_LIST) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py" + "--filter=${STYLE_FILTER}" + ${SOURCES_LIST} + COMMENT "cpplint: Checking source code style" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() endif() endmacro() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 9724c16122ab2e6be55864c8716698c9b9d7c3f0..5e3e437a8da9624df35a5c754fe77be73f20361d 100644 --- a/cmake/cross_compiling/android.cmake +++ 
b/cmake/cross_compiling/android.cmake @@ -106,6 +106,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") SET(CMAKE_SYSTEM_PROCESSOR armv7-a) ENDIF() ENDIF() + IF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + SET(CMAKE_SYSTEM_PROCESSOR aarch64) + ENDIF() SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") ENDIF() @@ -162,6 +166,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() ENDIF() + IF(ANDROID_ABI STREQUAL "arm64-v8a") + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) + ENDIF() + STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}") @@ -186,6 +194,10 @@ ELSE() SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) ENDIF() SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) - SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) + IF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + ENDIF() + ENDIF() ENDIF() diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index af9be86961833dcd62371227165d411a3b61d79e..69f40df51680a104c47d9335c070c570dcaff59a 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -11,11 +11,16 @@ find_path(CUDNN_INCLUDE_DIR cudnn.h get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT} ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib - ${CUDNN_ROOT}/lib/x86_64-linux-gnu + ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index 
8116f235d535917c03deb646ff4ec083a0cdadc7..85cce80b70a1fcf57015ac7a264e4950616b2717 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -2,13 +2,13 @@ INCLUDE(ExternalProject) SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) -INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) +INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ExternalProject_Add( - linb_any + extern_lib_any ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/thelink2012/any.git" - GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" + GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git" + GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d" PREFIX ${ANY_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -17,4 +17,15 @@ ExternalProject_Add( TEST_COMMAND "" ) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + add_library(lib_any STATIC ${dummyfile}) +else() + add_library(lib_any INTERFACE) +endif() + +add_dependencies(lib_any extern_lib_any) + add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) +LIST(APPEND external_project_dependencies lib_any) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f7483f6be9169eb58f0148cd3a956a8c881e1fe3 --- /dev/null +++ b/cmake/external/eigen.cmake @@ -0,0 +1,30 @@ +INCLUDE(ExternalProject) + +SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) + +INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) + +ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" + GIT_TAG "master" + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) + file(WRITE ${dummyfile} "const char * 
dummy_eigen3 = \"${dummyfile}\";") + add_library(eigen3 STATIC ${dummyfile}) +else() + add_library(eigen3 INTERFACE) +endif() + +add_dependencies(eigen3 extern_eigen3) + +LIST(APPEND external_project_dependencies eigen3) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 0afb3ab9af48046af01f03838eefa0bd2fcb2821..16e5bef4cdb8d6513de51838e3c3c8398dbad60d 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,9 +26,16 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( - gflags + extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/gflags/gflags.git" + # TODO(yiwang): The annoying warnings mentioned in + # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by + # gflags. I fired a PR https://github.com/gflags/gflags/pull/230 + # to fix it. Before it gets accepted by the gflags team, we use + # my personal fork, which contains above fix, temporarily. Let's + # change this back to the official Github repo once my PR is + # merged. 
+ GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -44,4 +51,8 @@ ExternalProject_Add( -DCMAKE_BUILD_TYPE:STRING=Release ) +ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) +ADD_DEPENDENCIES(gflags extern_gflags) + LIST(APPEND external_project_dependencies gflags) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 4a9e2ecc6bbe74c5856a55fb0c982777d7ac25b7..8a594a825abdca6a0f989b94fa42f97d6df5e10a 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -27,7 +27,7 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) ExternalProject_Add( - glog + extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY "https://github.com/google/glog.git" @@ -38,14 +38,21 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DWITH_GFLAGS=ON CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags CMAKE_ARGS -DBUILD_TESTING=OFF CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=Release ) +ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) +ADD_DEPENDENCIES(glog extern_glog gflags) +LINK_LIBRARIES(glog gflags) + LIST(APPEND external_project_dependencies glog) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 49c7d71443cda700a14af6be65ff6658eec7229f..e3970073a1a0b946fa1db6642799719d7a9fcf4f 100644 --- a/cmake/external/gtest.cmake +++ 
b/cmake/external/gtest.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,9 +34,15 @@ IF(WITH_TESTING) "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) ENDIF(WIN32) + IF(WITH_MKLML) + # wait for mklml downloading completed + SET(GTEST_DEPENDS ${MKLML_PROJECT}) + ENDIF() + ExternalProject_Add( - gtest + extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${GTEST_DEPENDS} GIT_REPOSITORY "https://github.com/google/googletest.git" GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} @@ -55,5 +61,14 @@ IF(WITH_TESTING) -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=Release ) - LIST(APPEND external_project_dependencies gtest) + + ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) + SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) + ADD_DEPENDENCIES(gtest extern_gtest) + + ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL) + SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) + ADD_DEPENDENCIES(gtest_main extern_gtest) + + LIST(APPEND external_project_dependencies gtest gtest_main) ENDIF(WITH_TESTING) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake new file mode 100644 index 0000000000000000000000000000000000000000..25c6b4ef52d3f8ebff1572ae8d348be7c577c08c --- /dev/null +++ b/cmake/external/mkldnn.cmake @@ -0,0 +1,67 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLDNN}) + return() +ENDIF(NOT ${WITH_MKLDNN}) + +INCLUDE(ExternalProject) + +SET(MKLDNN_PROJECT "extern_mkldnn") +SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) +SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Force WITH_MKLDNN=OFF") + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) + return() +ENDIF() + +SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." 
FORCE) +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") + +INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) + +IF(${CBLAS_PROVIDER} STREQUAL "MKLML") + SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) + SET(MKLDNN_MKLROOT ${MKLML_ROOT}) + SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) + SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) + MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}") +ENDIF() + +ExternalProject_Add( + ${MKLDNN_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_DEPENDS} + GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" + GIT_TAG "v0.9" + PREFIX ${MKLDNN_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + -DMKLROOT:PATH=${MKLDNN_MKLROOT} +) + +ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) +MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") +LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e9fd3d4bedc983ae7c544cf289dc841cf22f9de4 --- /dev/null +++ b/cmake/external/mklml.cmake @@ -0,0 +1,67 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLML}) + return() +ENDIF(NOT ${WITH_MKLML}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLML in Paddle yet." + "Force WITH_MKLML=OFF") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(MKLML_PROJECT "extern_mklml") +SET(MKLML_VER "mklml_lnx_2018.0.20170720") +SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") +SET(MKLML_DST_DIR "mklml") +SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) +SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) +SET(MKLML_INC_DIR ${MKLML_ROOT}/include) +SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) +SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) +SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") + +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) + +FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(MKLML)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${MKLML_VER}\n" + " DESTINATION ${MKLML_DST_DIR})\n") + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} +) + +ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) +ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +LIST(APPEND 
external_project_dependencies mklml) diff --git a/cmake/external/nnpack.cmake b/cmake/external/nnpack.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d42bcb0f329041462bd8b568052fbb8226d18e4e --- /dev/null +++ b/cmake/external/nnpack.cmake @@ -0,0 +1,30 @@ +# Find the NNPACK library +# NNPACK_ROOT - where to find NNPACK include and library. +# + +set(NNPACK_FOUND OFF) +set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") +find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) +find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) +find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib) + +if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) + set(NNPACK_FOUND ON) + INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) + + set(NNPACK_LIBS) + list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB}) + if (NNPACK_UKERNELS_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB}) + endif() + if (NNPACK_CPUFEATURES_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB}) + endif() + if(NOT ANDROID) + list(APPEND NNPACK_LIBS "rt") + endif() +else() + message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") +endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index b6bd24fe8ae28b290f93d74dc5ca2b98302bf2a5..db09232c0e69016bf18c1d981e4620e9e804ff7c 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -21,27 +21,38 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." 
FORCE) - SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}" + SET(CBLAS_LIBRARIES + "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) - SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1) + SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs) - IF(ANDROID) - # arm_soft_fp_abi branch of OpenBLAS to support softfp - # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0 libs) - ELSEIF(RPI) - # use hardfp - SET(OPENBLAS_COMMIT "v0.2.19") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0 libs) + IF(CMAKE_CROSSCOMPILING) + IF(ANDROID) + # arm_soft_fp_abi branch of OpenBLAS to support softfp + # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi + SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(TARGET "ARMV7") + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(TARGET "ARMV8") + ENDIF() + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0) + ELSEIF(RPI) + # use hardfp + SET(OPENBLAS_COMMIT "v0.2.19") + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0) + ENDIF() ELSE() SET(OPENBLAS_COMMIT "v0.2.19") - SET(OPENBLAS_ARGS DYNAMIC_ARCH=1 libs) + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) + ENDIF() ENDIF() ExternalProject_Add( - openblas + extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git GIT_TAG ${OPENBLAS_COMMIT} @@ -53,8 +64,19 @@ IF(NOT ${CBLAS_FOUND}) UPDATE_COMMAND "" CONFIGURE_COMMAND "" ) - LIST(APPEND external_project_dependencies openblas) ENDIF(NOT ${CBLAS_FOUND}) 
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) + +# FIXME(gangliao): generate cblas target to track all high performance +# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +ADD_LIBRARY(cblas STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + +IF(NOT ${CBLAS_FOUND}) + ADD_DEPENDENCIES(cblas extern_openblas) + LIST(APPEND external_project_dependencies cblas) +ENDIF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index b35e6839cdc2ee062a9066585f0c83948d87e385..e629d61585c2d2ff916187ee28d4fd089a5bd857 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -13,24 +13,153 @@ # limitations under the License. INCLUDE(ExternalProject) +# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp +FIND_PACKAGE(Protobuf QUIET) +SET(PROTOBUF_FOUND "OFF") + +if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_generate_python is not defined. 
+ function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + endfunction() +endif() + +# Print and set the 
protobuf library information, +# finish this cmake process and exit from this file. +macro(PROMPT_PROTOBUF_LIB) + SET(protobuf_DEPS ${ARGN}) + + MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") + MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") + MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}") + INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) + + # Assuming that all the protobuf libraries are of the same type. + IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE STATIC) + ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE SHARED) + ELSE() + MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") + ENDIF() + + ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) + + ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY}) + + ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_LIBRARY}) + + ADD_EXECUTABLE(protoc IMPORTED GLOBAL) + SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE}) + # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. + # make `protobuf_generate_cpp` happy. 
+ SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + + FOREACH(dep ${protobuf_DEPS}) + ADD_DEPENDENCIES(protobuf ${dep}) + ADD_DEPENDENCIES(protobuf_lite ${dep}) + ADD_DEPENDENCIES(libprotoc ${dep}) + ADD_DEPENDENCIES(protoc ${dep}) + ENDFOREACH() + + LIST(APPEND external_project_dependencies protobuf) + RETURN() +endmacro() +macro(SET_PROTOBUF_VERSION) + EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) + STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") +endmacro() + +set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") +if (NOT "${PROTOBUF_ROOT}" STREQUAL "") + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include) + find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib) + find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin) + if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) + message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") + SET_PROTOBUF_VERSION() + PROMPT_PROTOBUF_LIB() + else() + message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.") + endif() +endif() FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) - SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME}) - SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME}) + STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(${TARGET_NAME}_LITE_LIBRARY - 
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_PROTOC_EXECUTABLE - "${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" PARENT_SCOPE) SET(OPTIONAL_CACHE_ARGS "") @@ -74,17 +203,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ENDFUNCTION() SET(PROTOBUF_VERSION 3.1) -IF(NOT CMAKE_CROSSCOMPILING) - FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION}) - - IF(PROTOBUF_FOUND) - EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) - STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") - IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0") - SET(PROTOBUF_FOUND OFF) - ENDIF() - ENDIF(PROTOBUF_FOUND) -ELSE() +IF(CMAKE_CROSSCOMPILING) build_protobuf(protobuf_host TRUE) LIST(APPEND external_project_dependencies protobuf_host) @@ -93,20 +212,22 @@ ELSE() ENDIF() IF(NOT PROTOBUF_FOUND) - build_protobuf(protobuf FALSE) - LIST(APPEND external_project_dependencies protobuf) + build_protobuf(extern_protobuf FALSE) - SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR} + SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} CACHE PATH "protobuf include directory." FORCE) - IF(NOT CMAKE_CROSSCOMPILING) - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE} + SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY} + CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." 
FORCE) + SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} + CACHE FILEPATH "protoc library." FORCE) + + IF(CMAKE_CROSSCOMPILING) + PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) + ELSE() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) ENDIF() - SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) ENDIF(NOT PROTOBUF_FOUND) - -MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") -MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") -INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake new file mode 100644 index 0000000000000000000000000000000000000000..9391c285c7544669a5b1a078b7473d7a656c1bb4 --- /dev/null +++ b/cmake/external/pybind11.cmake @@ -0,0 +1,30 @@ +INCLUDE(ExternalProject) + +SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind) + +INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include) + +ExternalProject_Add( + extern_pybind + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/pybind/pybind11.git" + GIT_TAG "v2.1.1" + PREFIX ${PYBIND_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + add_library(pybind STATIC ${dummyfile}) +else() + add_library(pybind INTERFACE) +endif() + +add_dependencies(pybind extern_pybind) + +LIST(APPEND external_project_dependencies pybind) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 
f4d0daab06c9fcf17f4af59c25f62b415074a52f..490c87d67ed79a238dd506127cd4d9855fab6626 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,10 +18,12 @@ INCLUDE(python_module) FIND_PACKAGE(PythonInterp 2.7) IF(WITH_PYTHON) FIND_PACKAGE(PythonLibs 2.7) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. + ADD_LIBRARY(python SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) ENDIF(WITH_PYTHON) SET(py_env "") -SET(USE_VIRTUALENV_FOR_TEST 1) IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) find_python_module(numpy REQUIRED) @@ -32,193 +34,6 @@ IF(PYTHONINTERP_FOUND) MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " "please use pip to upgrade protobuf. pip install -U protobuf") ENDIF() -ELSE(PYTHONINTERP_FOUND) - MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.") - ##################################### PYTHON ######################################## - SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python) - SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python) - SET(_python_DIR ${PYTHON_INSTALL_DIR}) - - IF(UNIX) - SET(PYTHON_FOUND ON) - SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE) - SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE) - SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE) - SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE) - ELSEIF(WIN32) - SET(PYTHON_FOUND ON) - SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE) - SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE) - SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE) - 
SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE) - ELSE() - MESSAGE(FATAL_ERROR "Unknown system !") - ENDIF() - - IF(APPLE) - LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS - -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON - ) - ENDIF() - - SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS) - - # Force Python build to "Release". - IF(CMAKE_CONFIGURATION_TYPES) - SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR}) - SET(CMAKE_CFG_INTDIR "Release") - ELSE() - LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS - -DCMAKE_BUILD_TYPE:STRING=Release - ) - ENDIF() - - ExternalProject_Add(python - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git" - PREFIX ${PYTHON_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DPYTHON_VERSION=2.7.12 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR} - -DBUILD_LIBPYTHON_SHARED:BOOL=OFF - -DUSE_SYSTEM_LIBRARIES:BOOL=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR} - -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES} - -DDOWNLOAD_SOURCES:BOOL=ON - -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF - ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS} - ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS} - DEPENDS zlib - ) - - SET(py_env - PATH=${PYTHON_INSTALL_DIR}/bin - PYTHONHOME=${PYTHON_INSTALL_DIR} - PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH}) - #################################################################################### - - ##################################### SETUPTOOLS ################################### - SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools) - ExternalProject_Add(setuptools - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SETUPTOOLS_SOURCES_DIR} - URL 
"https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz" - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python zlib - ) - ##################################################################################### - - ##################################### SIX ########################################### - SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six) - ExternalProject_Add(six - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SIX_SOURCES_DIR} - URL https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python setuptools - ) - ##################################################################################### - - ##################################### CYTHON ######################################## - SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython) - ExternalProject_Add(cython - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${CYTHON_SOURCES_DIR} - URL https://github.com/cython/cython/archive/0.25.2.tar.gz - GIT_TAG 0.25.2 - BUILD_IN_SOURCE 1 - CONFIGURE_COMMAND "" - PATCH_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python - ) - #################################################################################### - - ##################################### NUMPY ######################################## - SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy) - SET(NUMPY_TAG_VERSION "v1.11.3") - SET(NUMPY_VERSION "1.11.3") - - SET(EGG_NAME "") - SET(PYTHON_NUMPY_INCLUDE_DIR "") - IF(WIN32) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg") - ELSE(WIN32) - IF(APPLE) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}") - 
ELSE(APPLE) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux") - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux") - ENDIF(APPLE) - - FOREACH(suffix x86_64 intel fat64 fat32 universal) - LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include) - ENDFOREACH() - ENDIF(WIN32) - - ExternalProject_Add(numpy - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY https://github.com/numpy/numpy.git - GIT_TAG ${NUMPY_TAG_VERSION} - CONFIGURE_COMMAND "" - UPDATE_COMMAND "" - PREFIX ${NUMPY_SOURCES_DIR} - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py build - INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - BUILD_IN_SOURCE 1 - DEPENDS python setuptools cython - ) - #################################################################################### - - ##################################### WHEEL ######################################## - SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel) - ExternalProject_Add(wheel - ${EXTERNAL_PROJECT_LOG_ARGS} - URL https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz - PREFIX ${WHEEL_SOURCES_DIR} - CONFIGURE_COMMAND "" - UPDATE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - BUILD_IN_SOURCE 1 - DEPENDS python setuptools - ) - #################################################################################### - - ################################### PROTOBUF ####################################### - SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf) - ExternalProject_Add(python-protobuf - ${EXTERNAL_PROJECT_LOG_ARGS} - URL https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz - URL_MD5 38b5fb160c768d2f8444d0c6d637ff91 - PREFIX ${PY_PROTOBUF_SOURCES_DIR} - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py build - INSTALL_COMMAND env ${py_env} 
${PYTHON_EXECUTABLE} setup.py install - DEPENDS python setuptools six - ) - #################################################################################### - - LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy) - ENDIF(PYTHONINTERP_FOUND) IF(WITH_PYTHON) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 293070c3cfcc1196001f64469f3254289b0de792..2d7daed9bcd5b8d854ffae6dc1ea191d154c16fe 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -43,7 +43,7 @@ ELSE() ENDIF() ExternalProject_Add( - warpctc + extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git" PREFIX ${WARPCTC_SOURCES_DIR} @@ -65,4 +65,8 @@ ExternalProject_Add( -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) +ADD_DEPENDENCIES(warpctc extern_warpctc) + LIST(APPEND external_project_dependencies warpctc) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 7a996dea92b13bdac054a987a004a3d54ff02da2..b27eb71550b68b5c27e47bf067ae0df329bbd628 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -9,6 +9,13 @@ function(CheckCompilerCXX11Flag) if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) message(FATAL_ERROR "Unsupported GCC version. 
GCC >= 4.8 required.") endif() + if(NOT ANDROID) + # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem. + # Use Debug mode instead for now. + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE) + endif() + endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" # Apple Clang is a different compiler than upstream Clang which havs different version numbers. @@ -109,7 +116,9 @@ set(COMMON_FLAGS -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare - -Wno-error=unused-local-typedefs) + -Wno-error=unused-local-typedefs + -Wno-error=parentheses-equality # Warnings in pybind11 +) set(GPU_COMMON_FLAGS -fPIC @@ -122,6 +131,7 @@ set(GPU_COMMON_FLAGS -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=unused-function # Warnings in Numpy Header. + -Wno-error=array-bounds # Warnings in Eigen::array ) if (APPLE) @@ -150,7 +160,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. 
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) +LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread) LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) if(CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -187,6 +197,7 @@ endif() # Modern gpu architectures: Pascal if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") + list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr) endif() # Custom gpu architecture diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 3ca735189da70ca826099843acf4528ee271e02f..d2aab938d4636b1583062e27b73cb30f5d56b7b0 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -1,141 +1,418 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -# To simplify the build process of PaddlePaddle, we defined couple of -# fundamental abstractions, e.g., how to build library, binary and -# test in C++, CUDA and Go. -# +# generic.cmake defines CMakes functions that look like Bazel's +# building rules (https://bazel.build/). 
+# +# # ------------------------------------------- -# C++ CUDA C++ Go +# C++ CUDA C++ Go # ------------------------------------------- -# cc_library nv_library go_library -# cc_binary nv_binary go_binary -# cc_test nv_test go_test +# cc_library nv_library go_library +# cc_binary nv_binary go_binary +# cc_test nv_test go_test # ------------------------------------------- # -# cmake_parse_arguments can help us to achieve this goal. -# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html +# To build a static library example.a from example.cc using the system +# compiler (like GCC): +# +# cc_library(example SRCS example.cc) +# +# To build a static library example.a from multiple source files +# example{1,2,3}.cc: +# +# cc_library(example SRCS example1.cc example2.cc example3.cc) +# +# To build a shared library example.so from example.cc: +# +# cc_library(example SHARED SRCS example.cc) +# +# To build a library using Nvidia's NVCC from .cu file(s), use the nv_ +# prefixed version: +# +# nv_library(example SRCS example.cu) +# +# To specify that a library new_example.a depends on other libraies: +# +# cc_library(new_example SRCS new_example.cc DEPS example) +# +# Static libraries can be composed of other static libraries: +# +# cc_library(composed DEPS dependent1 dependent2 dependent3) +# +# To build an executable binary file from some source files and +# dependent libraries: +# +# cc_binary(example SRCS main.cc something.cc DEPS example1 example2) +# +# To build an executable binary file using NVCC, use the nv_ prefixed +# version: +# +# nv_binary(example SRCS main.cc something.cu DEPS example1 example2) +# +# To build a unit test binary, which is an executable binary with +# GoogleTest linked: +# +# cc_test(example_test SRCS example_test.cc DEPS example) +# +# To build a unit test binary using NVCC, use the nv_ prefixed version: +# +# nv_test(example_test SRCS example_test.cu DEPS example) +# +# It is pretty often that executable and test binaries depend 
on +# pre-defined external libaries like glog and gflags defined in +# /cmake/external/*.cmake: +# +# cc_test(example_test SRCS example_test.cc DEPS example glog gflags) +# +# To build a go static library using Golang, use the go_ prefixed version: +# +# go_library(example STATIC) +# +# To build a go shared library using Golang, use the go_ prefixed version: +# +# go_library(example SHARED) +# + +# including binary directory for generated headers. +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +if(NOT APPLE AND NOT ANDROID) + find_package(Threads REQUIRED) + link_libraries(${CMAKE_THREAD_LIBS_INIT}) + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt") +endif(NOT APPLE AND NOT ANDROID) + +function(merge_static_libs TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + + # Get all propagation dependencies from the merged libraries + foreach(lib ${libs}) + list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) + endforeach() + list(REMOVE_DUPLICATES libs_deps) + + if(APPLE) # Use OSX's libtool to merge archives + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). 
+ add_custom_command(OUTPUT ${dummyfile} + COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + add_library(${TARGET_NAME} STATIC ${dummyfile}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged + set(libfiles ${libfiles} $) + endforeach() + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" + COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) + else() # general UNIX: use "ar" to extract objects and re-add to a common lib + foreach(lib ${libs}) + set(objlistfile ${lib}.objlist) # list of objects in the input library + set(objdir ${lib}.objdir) + + add_custom_command(OUTPUT ${objdir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} + DEPENDS ${lib}) + + add_custom_command(OUTPUT ${objlistfile} + COMMAND ${CMAKE_AR} -x "$" + COMMAND ${CMAKE_AR} -t "$" > ../${objlistfile} + DEPENDS ${lib} ${objdir} + WORKING_DIRECTORY ${objdir}) + + # Empty dummy source file that goes into merged library + set(mergebase ${lib}.mergebase.c) + add_custom_command(OUTPUT ${mergebase} + COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} + DEPENDS ${objlistfile}) + + list(APPEND mergebases "${mergebase}") + endforeach() + + add_library(${TARGET_NAME} STATIC ${mergebases}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + # Get the file name of the generated library + set(outlibfile "$") + + foreach(lib ${libs}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_AR} cr ${outlibfile} *.o + COMMAND ${CMAKE_RANLIB} ${outlibfile} + WORKING_DIRECTORY ${lib}.objdir) + endforeach() + endif() +endfunction(merge_static_libs) -# cc_library parses tensor.cc and figures out that target also depend on tensor.h. 
-# cc_library(tensor -# SRCS -# tensor.cc -# DEPS -# variant) function(cc_library TARGET_NAME) - set(options OPTIONAL) + set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (${cc_library_OPTIONAL} STREQUAL "SHARED") - add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) - else() - add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) - endif() - add_dependencies(${TARGET_NAME} ${cc_library_DEPS} ${external_project_dependencies}) + if (cc_library_SRCS) + if (cc_library_SHARED OR cc_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) + else() + add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + endif() + if (cc_library_DEPS) + add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) + endif() + + # cpplint code style + foreach(source_file ${cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) + + else(cc_library_SRCS) + if (cc_library_DEPS) + merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) + else() + message(FATAL "Please specify source file or library in cc_library.") + endif() + endif(cc_library_SRCS) endfunction(cc_library) -# cc_binary parses tensor.cc and figures out that target also depend on tensor.h. 
-# cc_binary(tensor -# SRCS -# tensor.cc) function(cc_binary TARGET_NAME) - set(options OPTIONAL) + set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_binary_SRCS}) - link_paddle_exe(${TARGET_NAME}) - if(cc_binary_DEPS) + if(cc_binary_DEPS) target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) endif() endfunction(cc_binary) -# The dependency to target tensor implies that if any of -# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built. -# cc_test(tensor_test -# SRCS -# tensor_test.cc -# DEPS -# tensor) function(cc_test TARGET_NAME) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_executable(${TARGET_NAME} ${cc_test_SRCS}) - link_paddle_test(${TARGET_NAME}) - if(cc_test_DEPS) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS}) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS}) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_SRCS}) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() - add_test(${TARGET_NAME} ${TARGET_NAME}) endfunction(cc_test) -# Suppose that ops.cu includes global functions that take Tensor as -# their parameters, so ops depend on tensor. This implies that if -# any of tensor.{h.cc}, ops.{h,cu} is changed, ops need to be re-built. 
-# nv_library(ops -# SRCS -# ops.cu -# DEPS -# tensor) function(nv_library TARGET_NAME) - set(options OPTIONAL) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (${nv_library_OPTIONAL} STREQUAL "SHARED") - cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) - else() - cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + if (WITH_GPU) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(nv_library_SRCS) + if (nv_library_SHARED OR nv_library_shared) # build *.so + cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) + else() + cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + endif() + if (nv_library_DEPS) + add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${nv_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS}) + else(nv_library_SRCS) + if (nv_library_DEPS) + merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(nv_library_SRCS) endif() - add_dependencies(${TARGET_NAME} ${nv_library_DEPS} ${external_project_dependencies}) endfunction(nv_library) function(nv_binary TARGET_NAME) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) - 
link_paddle_exe(${TARGET_NAME}) - if(nv_binary_DEPS) - target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) - add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) + if (WITH_GPU) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + if(nv_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) + endif() endif() endfunction(nv_binary) -# The dependency to target tensor implies that if any of -# ops{.h,.cu,_test.cu} is changed, ops_test need to be re-built. -# nv_test(ops_test -# SRCS -# ops_test.cu -# DEPS -# ops) function(nv_test TARGET_NAME) - set(options "") + if (WITH_GPU AND WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) + add_test(${TARGET_NAME} ${TARGET_NAME}) + endif() +endfunction(nv_test) + +function(go_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs DEPS) + cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (go_library_SHARED OR go_library_shared) + set(BUILD_MODE "-buildmode=c-shared") + set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + else() + set(BUILD_MODE "-buildmode=c-archive") + set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + endif() + + 
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # This custom command will always run since it depends on a not + # existing file. + add_custom_command( + OUTPUT dummy_rebulid_${TARGET_NAME} + COMMAND cmake -E touch ${dummyfile} + ) + # Create a custom target that depends on the custom command output + # file, so the custom command can be referenced as a dependency by + # `add_dependencies`. + add_custom_target(rebuild_${TARGET_NAME} + DEPENDS dummy_rebulid_${TARGET_NAME} + ) + + # Add dummy code to support `make target_name` under Terminal Command + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + if (go_library_SHARED OR go_library_shared) + add_library(${TARGET_NAME} SHARED ${dummyfile}) + else() + add_library(${TARGET_NAME} STATIC ${dummyfile}) + endif() + if(go_library_DEPS) + add_dependencies(${TARGET_NAME} ${go_library_DEPS}) + endif(go_library_DEPS) + + # The "source file" of the library is `${dummyfile}` which never + # change, so the target will never rebuild. Make the target depends + # on the custom command that touches the library "source file", so + # rebuild will always happen. 
+ add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME}) + + set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}") + + file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${${TARGET_NAME}_LIB_PATH}" + # Golang build source code + COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + -o "${${TARGET_NAME}_LIB_PATH}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" + # must run under GOPATH + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_dependencies(${TARGET_NAME} go_vendor) +endfunction(go_library) + +function(go_binary TARGET_NAME) + set(options OPTIONAL) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - link_paddle_test(${TARGET_NAME}) - if(nv_test_DEPS) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS}) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS}) + cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + + add_custom_command(OUTPUT ${TARGET_NAME}_timestamp + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) + install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) +endfunction(go_binary) + +function(go_test TARGET_NAME) + set(options OPTIONAL) + set(oneValueArgs "") + set(multiValueArgs DEPS) + 
cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race + -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" + ".${CMAKE_CURRENT_SOURCE_REL_DIR}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +endfunction(go_test) + +function(proto_library TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(proto_srcs) + set(proto_hdrs) + protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) + cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) +endfunction() + +function(py_proto_compile TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(py_srcs) + protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) + add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) +endfunction() + +function(py_test TARGET_NAME) + if(WITH_TESTING) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python + python2 ${py_test_SRCS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() - add_test(${TARGET_NAME} ${TARGET_NAME}) -endfunction(nv_test) +endfunction() diff --git a/cmake/package.cmake b/cmake/package.cmake index 
ff49a2d08e8f6004320acfce266339aa301eb9c4..79e02147f3f7cc19c1bf45d8a1d208a9a32416ff 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "") set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") set(CPACK_DEBIAN_PACKAGE_SECTION Devel) set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) -set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst") +set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst") #set(CPACK_GENERATOR "DEB") # Start cpack include (CMakePackageConfigHelpers) diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake index 9ff1a77cac74fb1bdfe470a78d225ed1767bb1b5..b698f3bdc3ff586a72badee3e0109e29285b457f 100644 --- a/cmake/rdma.cmake +++ b/cmake/rdma.cmake @@ -10,7 +10,7 @@ if(WITH_RDMA) function(generate_rdma_links) #redirect to current DIR to isolate the pollution from system runtime environment - #it can benifits unified control for different gcc environment. + #it can benifits unified control for different gcc environment. #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version #runtime libraries that will crash process while loading it. That redirect trick #can fix it. 
@@ -19,7 +19,9 @@ if(WITH_RDMA) COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1 COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1 - COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so + COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so + COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1 + COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endfunction(generate_rdma_links) @@ -44,7 +46,7 @@ if(WITH_RDMA) RDMA_INC_XIO AND RDMA_INC_EVENT AND RDMA_INC_NUMA AND - RDMA_LIB_SXISOCK AND + RDMA_LIB_SXISOCK AND RDMA_LIB_XIO AND RDMA_LIB_EVENT AND RDMA_LIB_EVENT_CORE AND @@ -53,19 +55,19 @@ if(WITH_RDMA) RDMA_LIB_NUMA ) - set(RDMA_INC_DIR - ${RDMA_INC_SXISOCK} + set(RDMA_INC_DIR + ${RDMA_INC_SXISOCK} ${RDMA_INC_XIO} ${RDMA_INC_EVENT} ${RDMA_INC_NUMA}) - set(RDMA_LIBS - ${RDMA_LIB_SXISOCK} - ${RDMA_LIB_XIO} - ${RDMA_LIB_EVENT} - ${RDMA_LIB_EVENT_CORE} - ${RDMA_LIB_EVENT_EXTRA} - ${RDMA_LIB_EVENT_PTHREADS} - ${RDMA_LIB_NUMA} + set(RDMA_LIBS + ${RDMA_LIB_SXISOCK} + ${RDMA_LIB_XIO} + ${RDMA_LIB_EVENT} + ${RDMA_LIB_EVENT_CORE} + ${RDMA_LIB_EVENT_EXTRA} + ${RDMA_LIB_EVENT_PTHREADS} + ${RDMA_LIB_NUMA} ) set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma") include_directories("${RDMA_INC_DIR}") diff --git a/cmake/system.cmake b/cmake/system.cmake index 904652413e026e3a7f3f2a19f48f4e906ce6babb..adf5e2c539740076ad1808353522c7467d765e64 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,6 +33,7 @@ ELSE(WIN32) SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. 
Set to empty string for default value.") ENDIF() + set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") ELSE(APPLE) IF(EXISTS "/etc/issue") @@ -84,24 +85,6 @@ IF(DEFINED CMAKE_SYSTEM_NAME) ENDIF() ENDIF() -# prefix and suffix on different os -IF(WIN32) - SET(LIBRARY_PREFIX "") - SET(SHARED_LIBRARY_SUFFIX ".dll") - SET(STATIC_LIBRARY_SUFFIX ".lib") - SET(EXECUTABLE_SUFFIX ".exe") -ELSE(WIN32) - SET(LIBRARY_PREFIX "lib") - IF(APPLE) - SET(SHARED_LIBRARY_SUFFIX ".dylib") - ELSE(APPLE) - SET(SHARED_LIBRARY_SUFFIX ".so") - ENDIF(APPLE) - - SET(STATIC_LIBRARY_SUFFIX ".a") - SET(EXECUTABLE_SUFFIX "") -ENDIF(WIN32) - # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/cmake/util.cmake b/cmake/util.cmake index b828eef322bc570c07f5c357353641117a094c16..0da4969d310368ab27b0ed65237813c07d6e59f0 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -84,6 +84,7 @@ function(link_paddle_exe TARGET_NAME) paddle_parameter paddle_proto paddle_cuda + paddle_optimizer ${EXTERNAL_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} @@ -117,7 +118,6 @@ endfunction() macro(add_unittest_without_exec TARGET_NAME) add_executable(${TARGET_NAME} ${ARGN}) link_paddle_test(${TARGET_NAME}) - add_style_check_target(${TARGET_NAME} ${ARGN}) endmacro() # add_unittest @@ -141,16 +141,20 @@ endmacro() function(create_resources res_file output_file) add_custom_command( OUTPUT ${output_file} - COMMAND python ARGS ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file} - DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py) + COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} + DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) endfunction() # Create a python unittest using run_python_tests.sh, # which takes care of making correct running environment function(add_python_test TEST_NAME) - add_test(NAME ${TEST_NAME} - COMMAND bash 
${PROJ_ROOT}/paddle/scripts/run_python_tests.sh - ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + foreach(arg ${ARGN}) + get_filename_component(py_fn ${arg} NAME_WE) + set(TRG_NAME ${TEST_NAME}_${py_fn}) + add_test(NAME ${TRG_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR} + python2 ${arg} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endforeach() endfunction() diff --git a/cmake/version.cmake b/cmake/version.cmake index ac1583a24c828629c46cb9cf4e965f8da2273732..cde650128a068faf32f4abfff5cdfdeb656d8577 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -4,7 +4,7 @@ set(tmp_version "HEAD") while ("${PADDLE_VERSION}" STREQUAL "") execute_process( COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version} - WORKING_DIRECTORY ${PROJ_ROOT} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} OUTPUT_VARIABLE GIT_TAG_NAME RESULT_VARIABLE GIT_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/demo/image_classification/.gitignore b/demo/image_classification/.gitignore deleted file mode 100644 index 6a05b8f6632db0977fceade8b48a89b9f7f6e6cc..0000000000000000000000000000000000000000 --- a/demo/image_classification/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -data/cifar-10-batches-py -data/cifar-out -cifar_vgg_model/* -plot.png -train.log -image_provider_copy_1.py -*pyc -train.list -test.list diff --git a/demo/image_classification/api_v2_resnet.py b/demo/image_classification/api_v2_resnet.py deleted file mode 100644 index 19d20540780becf504973a23b50445d4b65dc2ef..0000000000000000000000000000000000000000 --- a/demo/image_classification/api_v2_resnet.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.v2 as paddle - -__all__ = ['resnet_cifar10'] - - -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - active_type=paddle.activation.Relu(), - ch_in=None): - tmp = paddle.layer.img_conv( - input=input, - filter_size=filter_size, - num_channels=ch_in, - num_filters=ch_out, - stride=stride, - padding=padding, - act=paddle.activation.Linear(), - bias_attr=False) - return paddle.layer.batch_norm(input=tmp, act=active_type) - - -def shortcut(ipt, n_in, n_out, stride): - if n_in != n_out: - return conv_bn_layer(ipt, n_out, 1, stride, 0, - paddle.activation.Linear()) - else: - return ipt - - -def basicblock(ipt, ch_out, stride): - ch_in = ch_out * 2 - tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear()) - short = shortcut(ipt, ch_in, ch_out, stride) - return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu()) - - -def layer_warp(block_func, ipt, features, count, stride): - tmp = block_func(ipt, features, stride) - for i in range(1, count): - tmp = block_func(tmp, features, 1) - return tmp - - -def resnet_cifar10(ipt, depth=32): - # depth should be one of 20, 32, 44, 56, 110, 1202 - assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 - nStages = {16, 64, 128} - conv1 = conv_bn_layer( - ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = paddle.layer.img_pool( - input=res3, pool_size=8, 
stride=1, pool_type=paddle.pooling.Avg()) - return pool diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py deleted file mode 100644 index 53cffa6fb4e8b2e19725f4f44bf7b9ffffb25232..0000000000000000000000000000000000000000 --- a/demo/image_classification/api_v2_train.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License - -import sys - -import paddle.v2 as paddle - -from api_v2_vgg import vgg_bn_drop - - -def main(): - datadim = 3 * 32 * 32 - classdim = 10 - - # PaddlePaddle init - paddle.init(use_gpu=False, trainer_count=1) - - image = paddle.layer.data( - name="image", type=paddle.data_type.dense_vector(datadim)) - - # Add neural network config - # option 1. resnet - # net = resnet_cifar10(image, depth=32) - # option 2. 
vgg - net = vgg_bn_drop(image) - - out = paddle.layer.fc(input=net, - size=classdim, - act=paddle.activation.Softmax()) - - lbl = paddle.layer.data( - name="label", type=paddle.data_type.integer_value(classdim)) - cost = paddle.layer.classification_cost(input=out, label=lbl) - - # Create parameters - parameters = paddle.parameters.create(cost) - - # Create optimizer - momentum_optimizer = paddle.optimizer.Momentum( - momentum=0.9, - regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128), - learning_rate=0.1 / 128.0, - learning_rate_decay_a=0.1, - learning_rate_decay_b=50000 * 100, - learning_rate_schedule='discexp', - batch_size=128) - - # End batch and end pass event handler - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.EndPass): - result = trainer.test( - reader=paddle.batch( - paddle.dataset.cifar.test10(), batch_size=128), - feeding={'image': 0, - 'label': 1}) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - - # Create trainer - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=momentum_optimizer) - trainer.train( - reader=paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=50000), - batch_size=128), - num_passes=5, - event_handler=event_handler, - feeding={'image': 0, - 'label': 1}) - - -if __name__ == '__main__': - main() diff --git a/demo/image_classification/api_v2_vgg.py b/demo/image_classification/api_v2_vgg.py deleted file mode 100644 index 1e0e6b93adde30425f17aa9cd07542275f4fec37..0000000000000000000000000000000000000000 --- a/demo/image_classification/api_v2_vgg.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.v2 as paddle - -__all__ = ['vgg_bn_drop'] - - -def vgg_bn_drop(input): - def conv_block(ipt, num_filter, groups, dropouts, num_channels=None): - return paddle.networks.img_conv_group( - input=ipt, - num_channels=num_channels, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act=paddle.activation.Relu(), - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type=paddle.pooling.Max()) - - conv1 = conv_block(input, 64, 2, [0.3, 0], 3) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5) - fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear()) - bn = paddle.layer.batch_norm( - input=fc1, - act=paddle.activation.Relu(), - layer_attr=paddle.attr.Extra(drop_rate=0.5)) - fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear()) - return fc2 diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh deleted file mode 100755 index 532178d627fe19ab8ea79ecae73e5328b5294bea..0000000000000000000000000000000000000000 --- a/demo/image_classification/data/download_cifar.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 
PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -tar zxf cifar-10-python.tar.gz -rm cifar-10-python.tar.gz -rm -rf cifar-out/* -echo Converting CIFAR data to images..... -python process_cifar.py ./cifar-10-batches-py ./cifar-out diff --git a/demo/image_classification/data/process_cifar.py b/demo/image_classification/data/process_cifar.py deleted file mode 100644 index db6666189e5b8008a6b66fb64afcdf98980e72bb..0000000000000000000000000000000000000000 --- a/demo/image_classification/data/process_cifar.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import sys -import os -import PIL.Image as Image -""" - Usage: python process_cifar input_dir output_dir -""" - - -def mkdir_not_exist(path): - """ - Make dir if the path does not exist. 
- path: the path to be created. - """ - if not os.path.exists(path): - os.mkdir(path) - - -def create_dir_structure(output_dir): - """ - Create the directory structure for the directory. - output_dir: the direcotry structure path. - """ - mkdir_not_exist(os.path.join(output_dir)) - mkdir_not_exist(os.path.join(output_dir, "train")) - mkdir_not_exist(os.path.join(output_dir, "test")) - - -def convert_batch(batch_path, label_set, label_map, output_dir, data_split): - """ - Convert CIFAR batch to the structure of Paddle format. - batch_path: the batch to be converted. - label_set: the set of labels. - output_dir: the output path. - data_split: whether it is training or testing data. - """ - data = np.load(batch_path) - for data, label, filename in zip(data['data'], data['labels'], - data['filenames']): - data = data.reshape((3, 32, 32)) - data = np.transpose(data, (1, 2, 0)) - label = label_map[label] - output_dir_this = os.path.join(output_dir, data_split, str(label)) - output_filename = os.path.join(output_dir_this, filename) - if not label in label_set: - label_set[label] = True - mkdir_not_exist(output_dir_this) - Image.fromarray(data).save(output_filename) - - -if __name__ == '__main__': - input_dir = sys.argv[1] - output_dir = sys.argv[2] - num_batch = 5 - create_dir_structure(output_dir) - label_map = { - 0: "airplane", - 1: "automobile", - 2: "bird", - 3: "cat", - 4: "deer", - 5: "dog", - 6: "frog", - 7: "horse", - 8: "ship", - 9: "truck" - } - labels = {} - for i in range(1, num_batch + 1): - convert_batch( - os.path.join(input_dir, "data_batch_%d" % i), labels, label_map, - output_dir, "train") - convert_batch( - os.path.join(input_dir, "test_batch"), {}, label_map, output_dir, - "test") diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py deleted file mode 100644 index 6a315ff094c1af5f8250d8a22ff96740dddd9808..0000000000000000000000000000000000000000 --- a/demo/image_classification/image_provider.py +++ 
/dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import io -import random - -import paddle.utils.image_util as image_util -from paddle.trainer.PyDataProvider2 import * - - -# -# {'img_size': 32, -# 'settings': a global object, -# 'color': True, -# 'mean_img_size': 32, -# 'meta': './data/cifar-out/batches/batches.meta', -# 'num_classes': 10, -# 'file_list': ('./data/cifar-out/batches/train_batch_000',), -# 'use_jpeg': True} -def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg, - is_train, **kwargs): - settings.mean_img_size = mean_img_size - settings.img_size = img_size - settings.num_classes = num_classes - settings.color = color - settings.is_train = is_train - - if settings.color: - settings.img_raw_size = settings.img_size * settings.img_size * 3 - else: - settings.img_raw_size = settings.img_size * settings.img_size - - settings.meta_path = meta - settings.use_jpeg = use_jpeg - - settings.img_mean = image_util.load_meta(settings.meta_path, - settings.mean_img_size, - settings.img_size, settings.color) - - settings.logger.info('Image size: %s', settings.img_size) - settings.logger.info('Meta path: %s', settings.meta_path) - settings.input_types = { - 'image': dense_vector(settings.img_raw_size), - 'label': integer_value(settings.num_classes) - } - - settings.logger.info('DataProvider Initialization finished') - - -@provider(init_hook=hook, 
min_pool_size=0) -def processData(settings, file_list): - """ - The main function for loading data. - Load the batch, iterate all the images and labels in this batch. - file_list: the batch file list. - """ - with open(file_list, 'r') as fdata: - lines = [line.strip() for line in fdata] - random.shuffle(lines) - for file_name in lines: - with io.open(file_name.strip(), 'rb') as file: - data = cPickle.load(file) - indexes = list(range(len(data['images']))) - if settings.is_train: - random.shuffle(indexes) - for i in indexes: - if settings.use_jpeg == 1: - img = image_util.decode_jpeg(data['images'][i]) - else: - img = data['images'][i] - img_feat = image_util.preprocess_img( - img, settings.img_mean, settings.img_size, - settings.is_train, settings.color) - label = data['labels'][i] - yield { - 'image': img_feat.astype('float32'), - 'label': int(label) - } diff --git a/demo/image_classification/image_util.py b/demo/image_classification/image_util.py deleted file mode 100644 index f09605394a19e09d92e555eeefb0b5646625b618..0000000000000000000000000000000000000000 --- a/demo/image_classification/image_util.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -from PIL import Image -from cStringIO import StringIO - - -def resize_image(img, target_size): - """ - Resize an image so that the shorter edge has length target_size. 
- img: the input image to be resized. - target_size: the target resized image size. - """ - percent = (target_size / float(min(img.size[0], img.size[1]))) - resized_size = int(round(img.size[0] * percent)), int( - round(img.size[1] * percent)) - img = img.resize(resized_size, Image.ANTIALIAS) - return img - - -def flip(im): - """ - Return the flipped image. - Flip an image along the horizontal direction. - im: input image, (H x W x K) ndarrays - """ - if len(im.shape) == 3: - return im[:, :, ::-1] - else: - return im[:, ::-1] - - -def crop_img(im, inner_size, color=True, test=True): - """ - Return cropped image. - The size of the cropped image is inner_size * inner_size. - im: (K x H x W) ndarrays - inner_size: the cropped image size. - color: whether it is color image. - test: whether in test mode. - If False, does random cropping and flipping. - If True, crop the center of images. - """ - if color: - height, width = max(inner_size, im.shape[1]), max(inner_size, - im.shape[2]) - padded_im = np.zeros((3, height, width)) - startY = (height - im.shape[1]) / 2 - startX = (width - im.shape[2]) / 2 - endY, endX = startY + im.shape[1], startX + im.shape[2] - padded_im[:, startY:endY, startX:endX] = im - else: - im = im.astype('float32') - height, width = max(inner_size, im.shape[0]), max(inner_size, - im.shape[1]) - padded_im = np.zeros((height, width)) - startY = (height - im.shape[0]) / 2 - startX = (width - im.shape[1]) / 2 - endY, endX = startY + im.shape[0], startX + im.shape[1] - padded_im[startY:endY, startX:endX] = im - if test: - startY = (height - inner_size) / 2 - startX = (width - inner_size) / 2 - else: - startY = np.random.randint(0, height - inner_size + 1) - startX = np.random.randint(0, width - inner_size + 1) - endY, endX = startY + inner_size, startX + inner_size - if color: - pic = padded_im[:, startY:endY, startX:endX] - else: - pic = padded_im[startY:endY, startX:endX] - if (not test) and (np.random.randint(2) == 0): - pic = flip(pic) - return pic - 
- -def decode_jpeg(jpeg_string): - np_array = np.array(Image.open(StringIO(jpeg_string))) - if len(np_array.shape) == 3: - np_array = np.transpose(np_array, (2, 0, 1)) - return np_array - - -def preprocess_img(im, img_mean, crop_size, is_train, color=True): - """ - Does data augmentation for images. - If is_train is false, cropping the center region from the image. - If is_train is true, randomly crop a region from the image, - and randomy does flipping. - im: (K x H x W) ndarrays - """ - im = im.astype('float32') - test = not is_train - pic = crop_img(im, crop_size, color, test) - pic -= img_mean - return pic.flatten() - - -def load_meta(meta_path, mean_img_size, crop_size, color=True): - """ - Return the loaded meta file. - Load the meta image, which is the mean of the images in the dataset. - The mean image is subtracted from every input image so that the expected mean - of each input image is zero. - """ - mean = np.load(meta_path)['data_mean'] - border = (mean_img_size - crop_size) / 2 - if color: - assert (mean_img_size * mean_img_size * 3 == mean.shape[0]) - mean = mean.reshape(3, mean_img_size, mean_img_size) - mean = mean[:, border:border + crop_size, border:border + - crop_size].astype('float32') - else: - assert (mean_img_size * mean_img_size == mean.shape[0]) - mean = mean.reshape(mean_img_size, mean_img_size) - mean = mean[border:border + crop_size, border:border + - crop_size].astype('float32') - return mean - - -def load_image(img_path, is_color=True): - """ - Load image and return. - img_path: image path. - is_color: is color image or not. - """ - img = Image.open(img_path) - img.load() - return img - - -def oversample(img, crop_dims): - """ - image : iterable of (H x W x K) ndarrays - crop_dims: (height, width) tuple for the crops. - Returned data contains ten crops of input image, namely, - four corner patches and the center patch as well as their - horizontal reflections. - """ - # Dimensions and center. 
- im_shape = np.array(img[0].shape) - crop_dims = np.array(crop_dims) - im_center = im_shape[:2] / 2.0 - - # Make crop coordinates - h_indices = (0, im_shape[0] - crop_dims[0]) - w_indices = (0, im_shape[1] - crop_dims[1]) - crops_ix = np.empty((5, 4), dtype=int) - curr = 0 - for i in h_indices: - for j in w_indices: - crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1]) - curr += 1 - crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate( - [-crop_dims / 2.0, crop_dims / 2.0]) - crops_ix = np.tile(crops_ix, (2, 1)) - - # Extract crops - crops = np.empty( - (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]), - dtype=np.float32) - ix = 0 - for im in img: - for crop in crops_ix: - crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :] - ix += 1 - crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :] # flip for mirrors - return crops - - -class ImageTransformer: - def __init__(self, - transpose=None, - channel_swap=None, - mean=None, - is_color=True): - self.transpose = transpose - self.channel_swap = None - self.mean = None - self.is_color = is_color - - def set_transpose(self, order): - if self.is_color: - assert 3 == len(order) - self.transpose = order - - def set_channel_swap(self, order): - if self.is_color: - assert 3 == len(order) - self.channel_swap = order - - def set_mean(self, mean): - # mean value, may be one value per channel - if mean.ndim == 1: - mean = mean[:, np.newaxis, np.newaxis] - else: - # elementwise mean - if self.is_color: - assert len(mean.shape) == 3 - self.mean = mean - - def transformer(self, data): - if self.transpose is not None: - data = data.transpose(self.transpose) - if self.channel_swap is not None: - data = data[self.channel_swap, :, :] - if self.mean is not None: - data -= self.mean - return data diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py deleted file mode 100755 index 49c0ff600c40e0222093ff0a8a2f7e8e38ccba29..0000000000000000000000000000000000000000 --- 
a/demo/image_classification/prediction.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os, sys -import numpy as np -import logging -from PIL import Image -from optparse import OptionParser - -import paddle.utils.image_util as image_util - -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import dense_vector -from paddle.trainer.config_parser import parse_config - -logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') -logging.getLogger().setLevel(logging.INFO) - - -class ImageClassifier(): - def __init__(self, - train_conf, - use_gpu=True, - model_dir=None, - resize_dim=None, - crop_dim=None, - mean_file=None, - oversample=False, - is_color=True): - """ - train_conf: network configure. - model_dir: string, directory of model. - resize_dim: int, resized image size. - crop_dim: int, crop size. - mean_file: string, image mean file. - oversample: bool, oversample means multiple crops, namely five - patches (the four corner patches and the center - patch) as well as their horizontal reflections, - ten crops in all. 
- """ - self.train_conf = train_conf - self.model_dir = model_dir - if model_dir is None: - self.model_dir = os.path.dirname(train_conf) - - self.resize_dim = resize_dim - self.crop_dims = [crop_dim, crop_dim] - self.oversample = oversample - self.is_color = is_color - - self.transformer = image_util.ImageTransformer(is_color=is_color) - self.transformer.set_transpose((2, 0, 1)) - - self.mean_file = mean_file - mean = np.load(self.mean_file)['data_mean'] - mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) - self.transformer.set_mean(mean) # mean pixel - gpu = 1 if use_gpu else 0 - conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu) - conf = parse_config(train_conf, conf_args) - swig_paddle.initPaddle("--use_gpu=%d" % (gpu)) - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(self.network, swig_paddle.GradientMachine) - self.network.loadParameters(self.model_dir) - - data_size = 3 * self.crop_dims[0] * self.crop_dims[1] - slots = [dense_vector(data_size)] - self.converter = DataProviderConverter(slots) - - def get_data(self, img_path): - """ - 1. load image from img_path. - 2. resize or oversampling. - 3. transformer data: transpose, sub mean. - return K x H x W ndarray. - img_path: image path. 
- """ - image = image_util.load_image(img_path, self.is_color) - if self.oversample: - # image_util.resize_image: short side is self.resize_dim - image = image_util.resize_image(image, self.resize_dim) - image = np.array(image) - input = np.zeros( - (1, image.shape[0], image.shape[1], 3), dtype=np.float32) - input[0] = image.astype(np.float32) - input = image_util.oversample(input, self.crop_dims) - else: - image = image.resize(self.crop_dims, Image.ANTIALIAS) - input = np.zeros( - (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32) - input[0] = np.array(image).astype(np.float32) - - data_in = [] - for img in input: - img = self.transformer.transformer(img).flatten() - data_in.append([img.tolist()]) - return data_in - - def forward(self, input_data): - in_arg = self.converter(input_data) - return self.network.forwardTest(in_arg) - - def forward(self, data, output_layer): - """ - input_data: py_paddle input data. - output_layer: specify the name of probability, namely the layer with - softmax activation. - return: the predicting probability of each label. - """ - input = self.converter(data) - self.network.forwardTest(input) - output = self.network.getLayerOutputs(output_layer) - # For oversampling, average predictions across crops. - # If not, the shape of output[name]: (1, class_number), - # the mean is also applicable. 
- return output[output_layer]['value'].mean(0) - - def predict(self, image=None, output_layer=None): - assert isinstance(image, basestring) - assert isinstance(output_layer, basestring) - data = self.get_data(image) - prob = self.forward(data, output_layer) - lab = np.argsort(-prob) - logging.info("Label of %s is: %d", image, lab[0]) - - -if __name__ == '__main__': - image_size = 32 - crop_size = 32 - multi_crop = True - config = "vgg_16_cifar.py" - output_layer = "__fc_layer_1__" - mean_path = "data/cifar-out/batches/batches.meta" - model_path = sys.argv[1] - image = sys.argv[2] - use_gpu = bool(int(sys.argv[3])) - - obj = ImageClassifier( - train_conf=config, - model_dir=model_path, - resize_dim=image_size, - crop_dim=crop_size, - mean_file=mean_path, - use_gpu=use_gpu, - oversample=multi_crop) - obj.predict(image, output_layer) diff --git a/demo/image_classification/preprocess.py b/demo/image_classification/preprocess.py deleted file mode 100755 index 2947ad239c36f9a02ed67ccf5906380cb70e37ce..0000000000000000000000000000000000000000 --- a/demo/image_classification/preprocess.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.utils.preprocess_img import ImageClassificationDatasetCreater -from optparse import OptionParser - - -def option_parser(): - parser = OptionParser(usage="usage: python preprcoess.py "\ - "-i data_dir [options]") - parser.add_option( - "-i", - "--input", - action="store", - dest="input", - help="Input data directory.") - parser.add_option( - "-s", - "--size", - action="store", - dest="size", - help="Processed image size.") - parser.add_option( - "-c", - "--color", - action="store", - dest="color", - help="whether to use color images.") - return parser.parse_args() - - -if __name__ == '__main__': - options, args = option_parser() - data_dir = options.input - processed_image_size = int(options.size) - color = options.color == "1" - data_creator = ImageClassificationDatasetCreater( - data_dir, processed_image_size, color) - data_creator.train_list_name = "train.txt" - data_creator.test_list_name = "test.txt" - data_creator.num_per_batch = 1000 - data_creator.overwrite = True - data_creator.create_batches() diff --git a/demo/image_classification/preprocess.sh b/demo/image_classification/preprocess.sh deleted file mode 100755 index c7396c6393599ef3f2c55089eb05f2435b2b4b82..0000000000000000000000000000000000000000 --- a/demo/image_classification/preprocess.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e - -data_dir=./data/cifar-out - -python preprocess.py -i $data_dir -s 32 -c 1 - -echo "data/cifar-out/batches/train.txt" > train.list -echo "data/cifar-out/batches/test.txt" > test.list diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh deleted file mode 100755 index e45bd47ad5925c6674d628a70a7ad7c4d5d5c173..0000000000000000000000000000000000000000 --- a/demo/image_classification/train.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -config=vgg_16_cifar.py -output=./cifar_vgg_model -log=train.log - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=1 \ ---trainer_count=1 \ ---num_passes=300 \ ---save_dir=$output \ -2>&1 | tee $log -paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1 - -python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/demo/image_classification/vgg_16_cifar.py b/demo/image_classification/vgg_16_cifar.py deleted file mode 100755 index 8ee4a64c15f885023a6e19812885b4f76bb12af9..0000000000000000000000000000000000000000 --- a/demo/image_classification/vgg_16_cifar.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) - -####################Data Configuration ################## -if not is_predict: - data_dir = 'data/cifar-out/batches/' - meta_path = data_dir + 'batches.meta' - - args = { - 'meta': meta_path, - 'mean_img_size': 32, - 'img_size': 32, - 'num_classes': 10, - 'use_jpeg': 1, - 'color': "color" - } - - define_py_data_sources2( - train_list="train.list", - test_list="train.list", - module='image_provider', - obj='processData', - args=args) - -######################Algorithm Configuration ############# -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0005 * 128)) - -#######################Network Configuration ############# -data_size = 3 * 32 * 32 -label_size = 10 -img = data_layer(name='image', size=data_size) -# small_vgg is predefined in trainer_config_helpers.networks -predict = small_vgg(input_image=img, num_channels=3, num_classes=label_size) - -if not is_predict: - lbl = data_layer(name="label", size=label_size) - outputs(classification_cost(input=predict, label=lbl)) -else: - outputs(predict) diff --git a/demo/introduction/.gitignore b/demo/introduction/.gitignore deleted file mode 100644 index c54f3f9480ce4ceefda98f77a812ec2d6cd4a5e3..0000000000000000000000000000000000000000 --- a/demo/introduction/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -dataprovider.pyc -empty.list -train.log -output -train.list diff --git a/demo/introduction/README.md 
b/demo/introduction/README.md deleted file mode 100644 index 0614a7afe645677ef0b65a17ea05f1dcfa45214f..0000000000000000000000000000000000000000 --- a/demo/introduction/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This folder contains scripts used in PaddlePaddle introduction. -- use `bash train.sh` to train a simple linear regression model -- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3]. diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py deleted file mode 100644 index 5b48aad0408800676ae7da16eba2dcbb2124f25f..0000000000000000000000000000000000000000 --- a/demo/introduction/dataprovider.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import random - - -# define data types of input: 2 real numbers -@provider( - input_types={'x': dense_vector(1), - 'y': dense_vector(1)}, use_seq=False) -def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield {'x': [x], 'y': [2 * x + 0.3]} diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py deleted file mode 100644 index 651dfaa4b7b4873810a0b393655541a62d1a311b..0000000000000000000000000000000000000000 --- a/demo/introduction/trainer_config.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -# 1. read data. Suppose you saved above python code as dataprovider.py -define_py_data_sources2( - train_list=['no_matter.txt'], - test_list=None, - module='dataprovider', - obj='process', - args={}) - -# 2. learning algorithm -settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - -# 3. Network configuration -x = data_layer(name='x', size=1) -y = data_layer(name='y', size=1) -y_predict = fc_layer( - input=x, - param_attr=ParamAttr(name='w'), - size=1, - act=LinearActivation(), - bias_attr=ParamAttr(name='b')) -cost = mse_cost(input=y_predict, label=y) -outputs(cost) diff --git a/demo/recommendation/.gitignore b/demo/recommendation/.gitignore deleted file mode 100644 index fd27ef62a87cae51f2392c0eba50a44490d029af..0000000000000000000000000000000000000000 --- a/demo/recommendation/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -log.txt -data/meta.bin -data/ml-1m -data/ratings.dat.train -data/ratings.dat.test -data/train.list -data/test.list -dataprovider_copy_1.py -*.pyc -output diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py deleted file mode 100644 index f6a061799e3ac50236a68beedaf700dd6c698a05..0000000000000000000000000000000000000000 --- a/demo/recommendation/api_train_v2.py +++ /dev/null @@ -1,125 +0,0 @@ -import paddle.v2 as paddle -import cPickle -import copy - - -def main(): - 
paddle.init(use_gpu=False) - movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() - uid = paddle.layer.data( - name='user_id', - type=paddle.data_type.integer_value( - paddle.dataset.movielens.max_user_id() + 1)) - usr_emb = paddle.layer.embedding(input=uid, size=32) - - usr_gender_id = paddle.layer.data( - name='gender_id', type=paddle.data_type.integer_value(2)) - usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16) - - usr_age_id = paddle.layer.data( - name='age_id', - type=paddle.data_type.integer_value( - len(paddle.dataset.movielens.age_table))) - usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) - - usr_job_id = paddle.layer.data( - name='job_id', - type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id( - ) + 1)) - - usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) - - usr_combined_features = paddle.layer.fc( - input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb], - size=200, - act=paddle.activation.Tanh()) - - mov_id = paddle.layer.data( - name='movie_id', - type=paddle.data_type.integer_value( - paddle.dataset.movielens.max_movie_id() + 1)) - mov_emb = paddle.layer.embedding(input=mov_id, size=32) - - mov_categories = paddle.layer.data( - name='category_id', - type=paddle.data_type.sparse_binary_vector( - len(paddle.dataset.movielens.movie_categories()))) - - mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) - - mov_title_id = paddle.layer.data( - name='movie_title', - type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) - mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) - mov_title_conv = paddle.networks.sequence_conv_pool( - input=mov_title_emb, hidden_size=32, context_len=3) - - mov_combined_features = paddle.layer.fc( - input=[mov_emb, mov_categories_hidden, mov_title_conv], - size=200, - act=paddle.activation.Tanh()) - - inference = paddle.layer.cos_sim( - a=usr_combined_features, b=mov_combined_features, size=1, 
scale=5) - cost = paddle.layer.mse_cost( - input=inference, - label=paddle.layer.data( - name='score', type=paddle.data_type.dense_vector(1))) - - parameters = paddle.parameters.create(cost) - - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=paddle.optimizer.Adam( - learning_rate=1e-4)) - feeding = { - 'user_id': 0, - 'gender_id': 1, - 'age_id': 2, - 'job_id': 3, - 'movie_id': 4, - 'category_id': 5, - 'movie_title': 6, - 'score': 7 - } - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d Batch %d Cost %.2f" % ( - event.pass_id, event.batch_id, event.cost) - - trainer.train( - reader=paddle.batch( - paddle.reader.shuffle( - paddle.dataset.movielens.train(), buf_size=8192), - batch_size=256), - event_handler=event_handler, - feeding=feeding, - num_passes=1) - - user_id = 234 - movie_id = 345 - - user = paddle.dataset.movielens.user_info()[user_id] - movie = paddle.dataset.movielens.movie_info()[movie_id] - - feature = user.value() + movie.value() - - def reader(): - yield feature - - infer_dict = copy.copy(feeding) - del infer_dict['score'] - - prediction = paddle.infer( - output=inference, - parameters=parameters, - reader=paddle.batch( - reader, batch_size=32), - feeding=infer_dict) - print(prediction + 5) / 2 - - -if __name__ == '__main__': - main() diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py deleted file mode 100755 index c20c65286621d701ad58409b539bbe9c813d453a..0000000000000000000000000000000000000000 --- a/demo/recommendation/common_utils.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.trainer.PyDataProvider2 import * - - -def meta_to_header(meta, name): - metas = meta[name]['__meta__']['raw_meta'] - for each_meta in metas: - slot_name = each_meta.get('name', '%s_id' % name) - if each_meta['type'] == 'id': - yield slot_name, integer_value(each_meta['max']) - elif each_meta['type'] == 'embedding': - is_seq = each_meta['seq'] == 'sequence' - yield slot_name, integer_value( - len(each_meta['dict']), - seq_type=SequenceType.SEQUENCE - if is_seq else SequenceType.NO_SEQUENCE) - elif each_meta['type'] == 'one_hot_dense': - yield slot_name, dense_vector(len(each_meta['dict'])) diff --git a/demo/recommendation/data/config.json b/demo/recommendation/data/config.json deleted file mode 100644 index f26e74ce47bb7843a571e6033f051c046b31f054..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/config.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "user": { - "file": { - "name": "users.dat", - "delimiter": "::" - }, - "fields": ["id", "gender", "age", "occupation"] - }, - "movie": { - "file": { - "name": "movies.dat", - "delimiter": "::" - }, - "fields": ["id", "title", "genres"] - } -} diff --git a/demo/recommendation/data/config_generator.py b/demo/recommendation/data/config_generator.py deleted file mode 100644 index 4ca496a252dffc62ed62bb8f2a5ee1661a940580..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/config_generator.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -config_generator.py - -Usage: - ./config_generator.py [--output_format=] - ./config_generator.py -h | --help - -Options: - -h --help Show this screen. - --output_format= Output Config format(json or yaml) [default: json]. -""" - -import json -import docopt -import copy - -DEFAULT_FILE = {"type": "split", "delimiter": ","} - -DEFAULT_FIELD = { - "id": { - "type": "id" - }, - "gender": { - "name": "gender", - "type": "embedding", - "dict": { - "type": "char_based" - } - }, - "age": { - "name": "age", - "type": "embedding", - "dict": { - "type": "whole_content", - "sort": True - } - }, - "occupation": { - "name": "occupation", - "type": "embedding", - "dict": { - "type": "whole_content", - "sort": "true" - } - }, - "title": { - "regex": { - "pattern": r"^(.*)\((\d+)\)$", - "group_id": 1, - "strip": True - }, - "name": "title", - "type": { - "name": "embedding", - "seq_type": "sequence", - }, - "dict": { - "type": "char_based" - } - }, - "genres": { - "type": "one_hot_dense", - "dict": { - "type": "split", - "delimiter": "|" - }, - "name": "genres" - } -} - - -def merge_dict(master_dict, slave_dict): - return dict(((k, master_dict.get(k) or slave_dict.get(k)) - for k in set(slave_dict) | set(master_dict))) - - -def main(filename, fmt): - with open(filename, 'r') as f: - conf = json.load(f) - obj = dict() - for k in conf: - val = conf[k] - file_dict = val['file'] - file_dict = merge_dict(file_dict, 
DEFAULT_FILE) - - fields = [] - for pos, field_key in enumerate(val['fields']): - assert isinstance(field_key, basestring) - field = copy.deepcopy(DEFAULT_FIELD[field_key]) - field['pos'] = pos - fields.append(field) - obj[k] = {"file": file_dict, "fields": fields} - meta = {"meta": obj} - # print meta - if fmt == 'json': - - def formatter(x): - import json - return json.dumps(x, indent=2) - elif fmt == 'yaml': - - def formatter(x): - import yaml - return yaml.safe_dump(x, default_flow_style=False) - else: - raise NotImplementedError("Dump format %s is not implemented" % fmt) - - print formatter(meta) - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version="0.1.0") - main(args[""], args["--output_format"]) diff --git a/demo/recommendation/data/meta_config.json b/demo/recommendation/data/meta_config.json deleted file mode 100644 index cc6a046e271dd0faaa47eeb5a5bef6d3604113fe..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/meta_config.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "meta": { - "movie": { - "fields": [ - { - "type": "id", - "pos": 0 - }, - { - "regex": { - "pattern": "^(.*)\\((\\d+)\\)$", - "group_id": 1, - "strip": true - }, - "type": { - "seq_type": "sequence", - "name": "embedding" - }, - "dict": { - "type": "char_based" - }, - "name": "title", - "pos": 1 - }, - { - "type": "one_hot_dense", - "dict": { - "delimiter": "|", - "type": "split" - }, - "name": "genres", - "pos": 2 - } - ], - "file": { - "delimiter": "::", - "type": "split", - "name": "movies.dat" - } - }, - "user": { - "fields": [ - { - "type": "id", - "pos": 0 - }, - { - "type": "embedding", - "dict": { - "type": "char_based" - }, - "name": "gender", - "pos": 1 - }, - { - "type": "embedding", - "dict": { - "sort": true, - "type": "whole_content" - }, - "name": "age", - "pos": 2 - }, - { - "type": "embedding", - "dict": { - "sort": "true", - "type": "whole_content" - }, - "name": "occupation", - "pos": 3 - } - ], - "file": { - "delimiter": "::", - 
"type": "split", - "name": "users.dat" - } - } - } -} diff --git a/demo/recommendation/data/meta_generator.py b/demo/recommendation/data/meta_generator.py deleted file mode 100644 index 38e4679d266c331a751114cd13f0e3453016cf26..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/meta_generator.py +++ /dev/null @@ -1,430 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Preprocess Movielens dataset, to get movie/user object. - -Usage: - ./preprocess.py [--config=] - ./preprocess.py -h | --help - -Options: - -h --help Show this screen. - --version Show version. - --config= Get MetaData config file [default: config.json]. 
-""" -import docopt -import os -import sys -import re -import collections - -try: - import cPickle as pickle -except ImportError: - import pickle - - -class UniqueIDGenerator(object): - def __init__(self): - self.pool = collections.defaultdict(self.__next_id__) - self.next_id = 0 - - def __next_id__(self): - tmp = self.next_id - self.next_id += 1 - return tmp - - def __call__(self, k): - return self.pool[k] - - def to_list(self): - ret_val = [None] * len(self.pool) - for k in self.pool.keys(): - ret_val[self.pool[k]] = k - return ret_val - - -class SortedIDGenerator(object): - def __init__(self): - self.__key_set__ = set() - self.dict = None - - def scan(self, key): - self.__key_set__.add(key) - - def finish_scan(self, compare=None, key=None, reverse=False): - self.__key_set__ = sorted( - list(self.__key_set__), cmp=compare, key=key, reverse=reverse) - self.dict = dict() - for idx, each_key in enumerate(self.__key_set__): - self.dict[each_key] = idx - - def __call__(self, key): - return self.dict[key] - - def to_list(self): - return self.__key_set__ - - -class SplitFileReader(object): - def __init__(self, work_dir, config): - assert isinstance(config, dict) - self.filename = config['name'] - self.delimiter = config.get('delimiter', ',') - self.work_dir = work_dir - - def read(self): - with open(os.path.join(self.work_dir, self.filename), 'r') as f: - for line in f: - line = line.strip() - if isinstance(self.delimiter, unicode): - self.delimiter = str(self.delimiter) - yield line.split(self.delimiter) - - @staticmethod - def create(work_dir, config): - assert isinstance(config, dict) - if config['type'] == 'split': - return SplitFileReader(work_dir, config) - - -class IFileReader(object): - READERS = [SplitFileReader] - - def read(self): - raise NotImplementedError() - - @staticmethod - def create(work_dir, config): - for reader_cls in IFileReader.READERS: - val = reader_cls.create(work_dir, config) - if val is not None: - return val - - -class 
IDFieldParser(object): - TYPE = 'id' - - def __init__(self, config): - self.__max_id__ = -sys.maxint - 1 - self.__min_id__ = sys.maxint - self.__id_count__ = 0 - - def scan(self, line): - idx = int(line) - self.__max_id__ = max(self.__max_id__, idx) - self.__min_id__ = min(self.__min_id__, idx) - self.__id_count__ += 1 - - def parse(self, line): - return int(line) - - def meta_field(self): - return { - "is_key": True, - 'max': self.__max_id__, - 'min': self.__min_id__, - 'count': self.__id_count__, - 'type': 'id' - } - - -class SplitEmbeddingDict(object): - def __init__(self, delimiter): - self.__id__ = UniqueIDGenerator() - self.delimiter = delimiter - - def scan(self, multi): - for val in multi.split(self.delimiter): - self.__id__(val) - - def parse(self, multi): - return map(self.__id__, multi.split(self.delimiter)) - - def meta_field(self): - return self.__id__.to_list() - - -class EmbeddingFieldParser(object): - TYPE = 'embedding' - - NO_SEQUENCE = "no_sequence" - SEQUENCE = "sequence" - - class CharBasedEmbeddingDict(object): - def __init__(self, is_seq=True): - self.__id__ = UniqueIDGenerator() - self.is_seq = is_seq - - def scan(self, s): - for ch in s: - self.__id__(ch) - - def parse(self, s): - return map(self.__id__, s) if self.is_seq else self.__id__(s[0]) - - def meta_field(self): - return self.__id__.to_list() - - class WholeContentDict(object): - def __init__(self, need_sort=True): - assert need_sort - self.__id__ = SortedIDGenerator() - self.__has_finished__ = False - - def scan(self, txt): - self.__id__.scan(txt) - - def meta_field(self): - if not self.__has_finished__: - self.__id__.finish_scan() - self.__has_finished__ = True - return self.__id__.to_list() - - def parse(self, txt): - return self.__id__(txt) - - def __init__(self, config): - try: - self.seq_type = config['type']['seq_type'] - except TypeError: - self.seq_type = EmbeddingFieldParser.NO_SEQUENCE - - if config['dict']['type'] == 'char_based': - self.dict = 
EmbeddingFieldParser.CharBasedEmbeddingDict( - self.seq_type == EmbeddingFieldParser.SEQUENCE) - elif config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ',')) - elif config['dict']['type'] == 'whole_content': - self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][ - 'sort']) - else: - print config - assert False - - self.name = config['name'] - - def scan(self, s): - self.dict.scan(s) - - def meta_field(self): - return { - 'name': self.name, - 'dict': self.dict.meta_field(), - 'type': 'embedding', - 'seq': self.seq_type - } - - def parse(self, s): - return self.dict.parse(s) - - -class OneHotDenseFieldParser(object): - TYPE = 'one_hot_dense' - - def __init__(self, config): - if config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict(config['dict']['delimiter']) - self.name = config['name'] - - def scan(self, s): - self.dict.scan(s) - - def meta_field(self): - # print self.dict.meta_field() - return { - 'dict': self.dict.meta_field(), - 'name': self.name, - 'type': 'one_hot_dense' - } - - def parse(self, s): - ids = self.dict.parse(s) - retv = [0.0] * len(self.dict.meta_field()) - for idx in ids: - retv[idx] = 1.0 - # print retv - return retv - - -class FieldParserFactory(object): - PARSERS = [IDFieldParser, EmbeddingFieldParser, OneHotDenseFieldParser] - - @staticmethod - def create(config): - if isinstance(config['type'], basestring): - config_type = config['type'] - elif isinstance(config['type'], dict): - config_type = config['type']['name'] - - assert config_type is not None - - for each_parser_cls in FieldParserFactory.PARSERS: - if config_type == each_parser_cls.TYPE: - return each_parser_cls(config) - print config - - -class CompositeFieldParser(object): - def __init__(self, parser, extractor): - self.extractor = extractor - self.parser = parser - - def scan(self, *args, **kwargs): - self.parser.scan(self.extractor.extract(*args, **kwargs)) - - def parse(self, *args, **kwargs): - return 
self.parser.parse(self.extractor.extract(*args, **kwargs)) - - def meta_field(self): - return self.parser.meta_field() - - -class PositionContentExtractor(object): - def __init__(self, pos): - self.pos = pos - - def extract(self, line): - assert isinstance(line, list) - return line[self.pos] - - -class RegexPositionContentExtractor(PositionContentExtractor): - def __init__(self, pos, pattern, group_id, strip=True): - PositionContentExtractor.__init__(self, pos) - pattern = pattern.strip() - self.pattern = re.compile(pattern) - self.group_id = group_id - self.strip = strip - - def extract(self, line): - line = PositionContentExtractor.extract(self, line) - match = self.pattern.match(line) - # print line, self.pattern.pattern, match - assert match is not None - txt = match.group(self.group_id) - if self.strip: - txt.strip() - return txt - - -class ContentExtractorFactory(object): - def extract(self, line): - pass - - @staticmethod - def create(config): - if 'pos' in config: - if 'regex' not in config: - return PositionContentExtractor(config['pos']) - else: - extra_args = config['regex'] - return RegexPositionContentExtractor( - pos=config['pos'], **extra_args) - - -class MetaFile(object): - def __init__(self, work_dir): - self.work_dir = work_dir - self.obj = dict() - - def parse(self, config): - config = config['meta'] - - ret_obj = dict() - for key in config.keys(): - val = config[key] - assert 'file' in val - reader = IFileReader.create(self.work_dir, val['file']) - assert reader is not None - assert 'fields' in val and isinstance(val['fields'], list) - fields_config = val['fields'] - field_parsers = map(MetaFile.__field_config_mapper__, fields_config) - - for each_parser in field_parsers: - assert each_parser is not None - - for each_block in reader.read(): - for each_parser in field_parsers: - each_parser.scan(each_block) - - metas = map(lambda x: x.meta_field(), field_parsers) - # print metas - key_index = filter( - lambda x: x is not None, - map(lambda (idx, 
meta): idx if 'is_key' in meta and meta['is_key'] else None, - enumerate(metas)))[0] - - key_map = [] - for i in range(min(key_index, len(metas))): - key_map.append(i) - for i in range(key_index + 1, len(metas)): - key_map.append(i) - - obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}} - - for each_block in reader.read(): - idx = field_parsers[key_index].parse(each_block) - val = [] - for i, each_parser in enumerate(field_parsers): - if i != key_index: - val.append(each_parser.parse(each_block)) - obj[idx] = val - ret_obj[key] = obj - self.obj = ret_obj - return ret_obj - - @staticmethod - def __field_config_mapper__(conf): - assert isinstance(conf, dict) - extrator = ContentExtractorFactory.create(conf) - field_parser = FieldParserFactory.create(conf) - assert extrator is not None - assert field_parser is not None - return CompositeFieldParser(field_parser, extrator) - - def dump(self, fp): - pickle.dump(self.obj, fp, pickle.HIGHEST_PROTOCOL) - - -def preprocess(binary_filename, dataset_dir, config, **kwargs): - assert isinstance(config, str) - with open(config, 'r') as config_file: - file_loader = None - if config.lower().endswith('.yaml'): - import yaml - file_loader = yaml - elif config.lower().endswith('.json'): - import json - file_loader = json - config = file_loader.load(config_file) - meta = MetaFile(dataset_dir) - meta.parse(config) - with open(binary_filename, 'wb') as outf: - meta.dump(outf) - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version='0.1.0') - kwargs = dict() - for key in args.keys(): - if key != '--help': - param_name = key - assert isinstance(param_name, str) - param_name = param_name.replace('<', '') - param_name = param_name.replace('>', '') - param_name = param_name.replace('--', '') - kwargs[param_name] = args[key] - preprocess(**kwargs) diff --git a/demo/recommendation/data/ml_data.sh b/demo/recommendation/data/ml_data.sh deleted file mode 100755 index 
2268d876389e0bdf5ead405e74d278d276626f82..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/ml_data.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -ex -cd "$(dirname "$0")" -# download the dataset -wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -# unzip the dataset -unzip ml-1m.zip -# remove the unused zip file -rm ml-1m.zip diff --git a/demo/recommendation/data/split.py b/demo/recommendation/data/split.py deleted file mode 100644 index be6869c22f04be1db0f8e9c35c73c851e4c490b0..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/split.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Separate movielens 1m dataset to train/test file. 
- -Usage: - ./separate.py [--test_ratio=] [--delimiter=] - ./separate.py -h | --help - -Options: - -h --help Show this screen. - --version Show version. - --test_ratio= Test ratio for separate [default: 0.1]. - --delimiter= File delimiter [default: ,]. -""" -import docopt -import collections -import random - - -def process(test_ratio, input_file, delimiter, **kwargs): - test_ratio = float(test_ratio) - rating_dict = collections.defaultdict(list) - with open(input_file, 'r') as f: - for line in f: - user_id = int(line.split(delimiter)[0]) - rating_dict[user_id].append(line.strip()) - - with open(input_file + ".train", 'w') as train_file: - with open(input_file + ".test", 'w') as test_file: - for k in rating_dict.keys(): - lines = rating_dict[k] - assert isinstance(lines, list) - random.shuffle(lines) - test_len = int(len(lines) * test_ratio) - for line in lines[:test_len]: - print >> test_file, line - - for line in lines[test_len:]: - print >> train_file, line - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version='0.1.0') - kwargs = dict() - for key in args.keys(): - if key != '--help': - param_name = key - assert isinstance(param_name, str) - param_name = param_name.replace('<', '') - param_name = param_name.replace('>', '') - param_name = param_name.replace('--', '') - kwargs[param_name] = args[key] - process(**kwargs) diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py deleted file mode 100755 index c4ff96d80e81926049c9a71d6d9d991c0b568c25..0000000000000000000000000000000000000000 --- a/demo/recommendation/dataprovider.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import common_utils # parse - - -def __list_to_map__(lst): - ret_val = dict() - for each in lst: - k, v = each - ret_val[k] = v - return ret_val - - -def hook(settings, meta, **kwargs): - """ - Init hook is invoked before process data. It will set obj.slots and store - data meta. - - :param obj: global object. It will passed to process routine. - :type obj: object - :param meta: the meta file object, which passed from trainer_config. Meta - file record movie/user features. - :param kwargs: unused other arguments. - """ - del kwargs # unused kwargs - - # Header define slots that used for paddle. - # first part is movie features. - # second part is user features. - # final part is rating score. - # header is a list of [USE_SEQ_OR_NOT?, SlotType] - movie_headers = list(common_utils.meta_to_header(meta, 'movie')) - settings.movie_names = [h[0] for h in movie_headers] - headers = movie_headers - user_headers = list(common_utils.meta_to_header(meta, 'user')) - settings.user_names = [h[0] for h in user_headers] - headers.extend(user_headers) - headers.append(("rating", dense_vector(1))) # Score - - # slot types. - settings.input_types = __list_to_map__(headers) - settings.meta = meta - - -@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, filename): - with open(filename, 'r') as f: - for line in f: - # Get a rating from file. 
- user_id, movie_id, score = map(int, line.split('::')[:-1]) - - # Scale score to [-5, +5] - score = float(score) * 2 - 5.0 - - # Get movie/user features by movie_id, user_id - movie_meta = settings.meta['movie'][movie_id] - user_meta = settings.meta['user'][user_id] - - outputs = [('movie_id', movie_id - 1)] - - # Then add movie features - for i, each_meta in enumerate(movie_meta): - outputs.append((settings.movie_names[i + 1], each_meta)) - - # Then add user id. - outputs.append(('user_id', user_id - 1)) - - # Then add user features. - for i, each_meta in enumerate(user_meta): - outputs.append((settings.user_names[i + 1], each_meta)) - - # Finally, add score - outputs.append(('rating', [score])) - # Return data to paddle - yield __list_to_map__(outputs) diff --git a/demo/recommendation/evaluate.py b/demo/recommendation/evaluate.py deleted file mode 100755 index 3afa7a1e9db5fefb1bbf5aaa174b8168afae4058..0000000000000000000000000000000000000000 --- a/demo/recommendation/evaluate.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -import re -import math - - -def get_best_pass(log_filename): - with open(log_filename, 'r') as f: - text = f.read() - pattern = re.compile('Test.*? 
cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)', - re.S) - results = re.findall(pattern, text) - sorted_results = sorted(results, key=lambda result: float(result[0])) - return sorted_results[0] - - -log_filename = sys.argv[1] -log = get_best_pass(log_filename) -predict_error = math.sqrt(float(log[0])) / 2 -print 'Best pass is %s, error is %s, which means predict get error as %f' % ( - log[1], log[0], predict_error) - -evaluate_pass = "output/pass-%s" % log[1] -print "evaluating from pass %s" % evaluate_pass diff --git a/demo/recommendation/evaluate.sh b/demo/recommendation/evaluate.sh deleted file mode 100755 index 02b2857de028bc9c05d7ddd67012043b671b2764..0000000000000000000000000000000000000000 --- a/demo/recommendation/evaluate.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | sort | head -n 1 -} - -LOG=`get_best_pass log.txt` -LOG=(${LOG}) -echo 'Best pass is '${LOG[1]}, ' error is '${LOG[0]}, 'which means predict get error as '`echo ${LOG[0]} | python -c 'import math; print math.sqrt(float(raw_input()))/2'` - -evaluate_pass="output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py deleted file mode 100755 index 8ad993eab3a9f637cfff752bfedbbc62eaf3c8d5..0000000000000000000000000000000000000000 --- a/demo/recommendation/prediction.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from py_paddle import swig_paddle, DataProviderConverter - -from common_utils import * -from paddle.trainer.config_parser import parse_config - -try: - import cPickle as pickle -except ImportError: - import pickle -import sys - -if __name__ == '__main__': - model_path = sys.argv[1] - swig_paddle.initPaddle('--use_gpu=0') - conf = parse_config("trainer_config.py", "is_predict=1") - network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(network, swig_paddle.GradientMachine) - network.loadParameters(model_path) - with open('./data/meta.bin', 'rb') as f: - meta = pickle.load(f) - headers = [h[1] for h in meta_to_header(meta, 'movie')] - headers.extend([h[1] for h in meta_to_header(meta, 'user')]) - cvt = DataProviderConverter(headers) - while True: - movie_id = int(raw_input("Input movie_id: ")) - user_id = int(raw_input("Input user_id: ")) - movie_meta = meta['movie'][movie_id] # Query Data From Meta. - user_meta = meta['user'][user_id] - data = [movie_id - 1] - data.extend(movie_meta) - data.append(user_id - 1) - data.extend(user_meta) - print "Prediction Score is %.2f" % ( - (network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 5) - / 2) diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh deleted file mode 100755 index eeb81ce3cb47e65c0aeb303e7571024ba82dad65..0000000000000000000000000000000000000000 --- a/demo/recommendation/preprocess.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -UNAME_STR=`uname` - -if [[ ${UNAME_STR} == 'Linux' ]]; then - SHUF_PROG='shuf' -else - SHUF_PROG='gshuf' -fi - - -cd "$(dirname "$0")" -delimiter='::' -dir=ml-1m -cd data -echo 'generate meta config file' -python config_generator.py config.json > meta_config.json -echo 'generate meta file' -python meta_generator.py $dir meta.bin --config=meta_config.json -echo 'split train/test file' -python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1 -echo 'shuffle train file' -${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train -cp $dir/ratings.dat.test . -echo "./data/ratings.dat.train" > train.list -echo "./data/ratings.dat.test" > test.list diff --git a/demo/recommendation/requirements.txt b/demo/recommendation/requirements.txt deleted file mode 100644 index 1ea154584a428b6a389309f1f8def502e0aadfce..0000000000000000000000000000000000000000 --- a/demo/recommendation/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -PyYAML -docopt diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh deleted file mode 100755 index 22aef556082ba429e9ca7c6dd3ec72699b9dbcf4..0000000000000000000000000000000000000000 --- a/demo/recommendation/run.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -paddle train \ - --config=trainer_config.py \ - --save_dir=./output \ - --use_gpu=false \ - --trainer_count=4\ - --test_all_data_in_one_period=true \ - --log_period=100 \ - --dot_period=1 \ - --num_passes=50 2>&1 | tee 'log.txt' -paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1 diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py deleted file mode 100755 index 25f529d7d7c430f179107fb189ade34760ab309d..0000000000000000000000000000000000000000 --- a/demo/recommendation/trainer_config.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -try: - import cPickle as pickle -except ImportError: - import pickle - -is_predict = get_config_arg('is_predict', bool, False) - -META_FILE = 'data/meta.bin' - -with open(META_FILE, 'rb') as f: - # load meta file - meta = pickle.load(f) - -settings( - batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer()) - - -def construct_feature(name): - """ - Construct movie/user features. - - This method read from meta data. Then convert feature to neural network due - to feature type. The map relation as follow. - - * id: embedding => fc - * embedding: - is_sequence: embedding => context_projection => fc => pool - not sequence: embedding => fc - * one_hot_dense: fc => fc - - Then gather all features vector, and use a fc layer to combined them as - return. - - :param name: 'movie' or 'user' - :type name: basestring - :return: combined feature output - :rtype: LayerOutput - """ - __meta__ = meta[name]['__meta__']['raw_meta'] - fusion = [] - for each_meta in __meta__: - type_name = each_meta['type'] - slot_name = each_meta.get('name', '%s_id' % name) - if type_name == 'id': - slot_dim = each_meta['max'] - embedding = embedding_layer( - input=data_layer( - slot_name, size=slot_dim), size=256) - fusion.append(fc_layer(input=embedding, size=256)) - elif type_name == 'embedding': - is_seq = each_meta['seq'] == 'sequence' - slot_dim = len(each_meta['dict']) - din = data_layer(slot_name, slot_dim) - embedding = embedding_layer(input=din, size=256) - if is_seq: - fusion.append( - text_conv_pool( - input=embedding, context_len=5, hidden_size=256)) - else: - fusion.append(fc_layer(input=embedding, size=256)) - elif type_name == 'one_hot_dense': - slot_dim = len(each_meta['dict']) - hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256) - fusion.append(fc_layer(input=hidden, size=256)) - - return fc_layer(name="%s_fusion" % name, input=fusion, size=256) - - -movie_feature = construct_feature("movie") 
-user_feature = construct_feature("user") -similarity = cos_sim(a=movie_feature, b=user_feature) -if not is_predict: - outputs(mse_cost(input=similarity, label=data_layer('rating', size=1))) - - define_py_data_sources2( - 'data/train.list', - 'data/test.list', - module='dataprovider', - obj='process', - args={'meta': meta}) -else: - outputs(similarity) diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore deleted file mode 100644 index 65c9b674c7d1dad53b7d1c6ee1dcbdb72553888d..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/.gitignore +++ /dev/null @@ -1,14 +0,0 @@ -*.pyc -train.log -data/feature -data/conll05st-release/ -data/src.dict -data/test.wsj.props -data/test.wsj.seq_pair -data/test.wsj.words -data/tgt.dict -output -data/emb -data/targetDict.txt -data/verbDict.txt -data/wordDict.txt diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py deleted file mode 100644 index 3af636aef5879b43641d55bd7c9b0b8a1242ff8b..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/api_train_v2.py +++ /dev/null @@ -1,277 +0,0 @@ -import math -import numpy as np -import gzip -import logging -import paddle.v2.dataset.conll05 as conll05 -import paddle.v2.evaluator as evaluator -import paddle.v2 as paddle - -logger = logging.getLogger('paddle') - -word_dict, verb_dict, label_dict = conll05.get_dict() -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_len = len(verb_dict) - -mark_dict_len = 2 -word_dim = 32 -mark_dim = 5 -hidden_dim = 512 -depth = 8 -default_std = 1 / math.sqrt(hidden_dim) / 3.0 -mix_hidden_lr = 1e-3 - - -def d_type(size): - return paddle.data_type.integer_value_sequence(size) - - -def db_lstm(): - #8 features - word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) - predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) - - ctx_n2 = paddle.layer.data(name='ctx_n2_data', 
type=d_type(word_dict_len)) - ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) - ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) - ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) - ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) - mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - - emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) - std_0 = paddle.attr.Param(initial_std=0.) - std_default = paddle.attr.Param(initial_std=default_std) - - predicate_embedding = paddle.layer.embedding( - size=word_dim, - input=predicate, - param_attr=paddle.attr.Param( - name='vemb', initial_std=default_std)) - mark_embedding = paddle.layer.embedding( - size=mark_dim, input=mark, param_attr=std_0) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - paddle.layer.embedding( - size=word_dim, input=x, param_attr=emb_para) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - - hidden_0 = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers - ]) - - lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) - hidden_para_attr = paddle.attr.Param( - initial_std=default_std, learning_rate=mix_hidden_lr) - - lstm_0 = paddle.layer.lstmemory( - input=hidden_0, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - bias_attr=std_0, - param_attr=lstm_para_attr) - - #stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - 
input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = paddle.layer.lstmemory( - input=mix_hidden, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - - feature_out = paddle.layer.mixed( - size=label_dict_len, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - - return feature_out - - -def load_parameter(file_name, h, w): - with open(file_name, 'rb') as f: - f.read(16) # skip header. - return np.fromfile(f, dtype=np.float32).reshape(h, w) - - -def train(): - paddle.init(use_gpu=False, trainer_count=1) - - # define network topology - feature_out = db_lstm() - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - crf_cost = paddle.layer.crf(size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param( - name='crfw', - initial_std=default_std, - learning_rate=mix_hidden_lr)) - - crf_dec = paddle.layer.crf_decoding( - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - evaluator.sum(input=crf_dec) - - # create parameters - parameters = paddle.parameters.create(crf_cost) - parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) - - # create optimizer - optimizer = paddle.optimizer.Momentum( - momentum=0, - learning_rate=2e-2, - regularization=paddle.optimizer.L2Regularization(rate=8e-4), - model_average=paddle.optimizer.ModelAverage( - average_window=0.5, max_average_window=10000), ) - - trainer = paddle.trainer.SGD(cost=crf_cost, - parameters=parameters, - update_equation=optimizer, - extra_layers=crf_dec) - - reader = paddle.batch( - paddle.reader.shuffle( - conll05.test(), buf_size=8192), batch_size=10) - 
- feeding = { - 'word_data': 0, - 'ctx_n2_data': 1, - 'ctx_n1_data': 2, - 'ctx_0_data': 3, - 'ctx_p1_data': 4, - 'ctx_p2_data': 5, - 'verb_data': 6, - 'mark_data': 7, - 'target': 8 - } - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - logger.info("Pass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics)) - if event.batch_id and event.batch_id % 1000 == 0: - result = trainer.test(reader=reader, feeding=feeding) - logger.info("\nTest with Pass %d, Batch %d, %s" % - (event.pass_id, event.batch_id, result.metrics)) - - if isinstance(event, paddle.event.EndPass): - # save parameters - with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: - parameters.to_tar(f) - - result = trainer.test(reader=reader, feeding=feeding) - logger.info("\nTest with Pass %d, %s" % - (event.pass_id, result.metrics)) - - trainer.train( - reader=reader, - event_handler=event_handler, - num_passes=10, - feeding=feeding) - - -def infer_a_batch(inferer, test_data, word_dict, pred_dict, label_dict): - probs = inferer.infer(input=test_data, field='id') - assert len(probs) == sum(len(x[0]) for x in test_data) - - for idx, test_sample in enumerate(test_data): - start_id = 0 - pred_str = "%s\t" % (pred_dict[test_sample[6][0]]) - - for w, tag in zip(test_sample[0], - probs[start_id:start_id + len(test_sample[0])]): - pred_str += "%s[%s] " % (word_dict[w], label_dict[tag]) - print(pred_str.strip()) - start_id += len(test_sample[0]) - - -def infer(): - label_dict_reverse = dict((value, key) - for key, value in label_dict.iteritems()) - word_dict_reverse = dict((value, key) - for key, value in word_dict.iteritems()) - pred_dict_reverse = dict((value, key) - for key, value in verb_dict.iteritems()) - - test_creator = paddle.dataset.conll05.test() - - paddle.init(use_gpu=False, trainer_count=1) - - # define network topology - feature_out = db_lstm() - predict = paddle.layer.crf_decoding( - 
size=label_dict_len, - input=feature_out, - param_attr=paddle.attr.Param(name='crfw')) - - test_pass = 0 - with gzip.open('params_pass_%d.tar.gz' % (test_pass)) as f: - parameters = paddle.parameters.Parameters.from_tar(f) - inferer = paddle.inference.Inference( - output_layer=predict, parameters=parameters) - - # prepare test data - test_data = [] - test_batch_size = 50 - - for idx, item in enumerate(test_creator()): - test_data.append(item[0:8]) - - if idx and (not idx % test_batch_size): - infer_a_batch( - inferer, - test_data, - word_dict_reverse, - pred_dict_reverse, - label_dict_reverse, ) - test_data = [] - infer_a_batch( - inferer, - test_data, - word_dict_reverse, - pred_dict_reverse, - label_dict_reverse, ) - test_data = [] - - -def main(is_inferring=False): - if is_inferring: - infer() - else: - train() - - -if __name__ == '__main__': - main(is_inferring=False) diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py deleted file mode 100644 index da44111976a0dec68345fc139d0aa459ca9211c2..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/extract_dict_feature.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import os -from optparse import OptionParser - - -def extract_dict_features(pair_file, feature_file): - - with open(pair_file) as fin, open(feature_file, 'w') as feature_out: - for line in fin: - sentence, predicate, labels = line.strip().split('\t') - sentence_list = sentence.split() - labels_list = labels.split() - - verb_index = labels_list.index('B-V') - - mark = [0] * len(labels_list) - if verb_index > 0: - mark[verb_index - 1] = 1 - ctx_n1 = sentence_list[verb_index - 1] - else: - ctx_n1 = 'bos' - - if verb_index > 1: - mark[verb_index - 2] = 1 - ctx_n2 = sentence_list[verb_index - 2] - else: - ctx_n2 = 'bos' - - mark[verb_index] = 1 - ctx_0 = sentence_list[verb_index] - - if verb_index < len(labels_list) - 1: - mark[verb_index + 1] = 1 - ctx_p1 = sentence_list[verb_index + 1] - else: - ctx_p1 = 'eos' - - if verb_index < len(labels_list) - 2: - mark[verb_index + 2] = 1 - ctx_p2 = sentence_list[verb_index + 2] - else: - ctx_p2 = 'eos' - - - feature_str = sentence + '\t' \ - + predicate + '\t' \ - + ctx_n2 + '\t' \ - + ctx_n1 + '\t' \ - + ctx_0 + '\t' \ - + ctx_p1 + '\t' \ - + ctx_p2 + '\t' \ - + ' '.join([str(i) for i in mark]) + '\t' \ - + labels - - feature_out.write(feature_str + '\n') - - -if __name__ == '__main__': - - usage = '-p pair_file -f feature_file' - parser = OptionParser(usage) - parser.add_option('-p', dest='pair_file', help='the pair file') - parser.add_option('-f', dest='feature_file', help='the feature file') - - (options, args) = parser.parse_args() - - extract_dict_features(options.pair_file, options.feature_file) diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py deleted file mode 100644 index 94a8488c16734eb1882d54f7ec36f4b9308c09d4..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/extract_pairs.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -from optparse import OptionParser - - -def read_labels(props_file): - ''' - a sentence maybe has more than one verb, each verb has its label sequence - label[], is a 3-dimension list. - the first dim is to store all sentence's label seqs, len is the sentence number - the second dim is to store all label sequences for one sentences - the third dim is to store each label for one word - ''' - labels = [] - with open(props_file) as fin: - label_seqs_for_one_sentences = [] - one_seg_in_file = [] - for line in fin: - line = line.strip() - if line == '': - for i in xrange(len(one_seg_in_file[0])): - a_kind_lable = [x[i] for x in one_seg_in_file] - label_seqs_for_one_sentences.append(a_kind_lable) - labels.append(label_seqs_for_one_sentences) - one_seg_in_file = [] - label_seqs_for_one_sentences = [] - else: - part = line.split() - one_seg_in_file.append(part) - return labels - - -def read_sentences(words_file): - sentences = [] - with open(words_file) as fin: - s = '' - for line in fin: - line = line.strip() - if line == '': - sentences.append(s) - s = '' - else: - s += line + ' ' - return sentences - - -def transform_labels(sentences, labels): - sen_lab_pair = [] - for i in xrange(len(sentences)): - if len(labels[i]) == 1: - continue - else: - verb_list = [] - for x in labels[i][0]: - if x != '-': - verb_list.append(x) - - for j in xrange(1, len(labels[i])): - label_list = 
labels[i][j] - current_tag = 'O' - is_in_bracket = False - label_seq = [] - verb_word = '' - for ll in label_list: - if ll == '*' and is_in_bracket == False: - label_seq.append('O') - elif ll == '*' and is_in_bracket == True: - label_seq.append('I-' + current_tag) - elif ll == '*)': - label_seq.append('I-' + current_tag) - is_in_bracket = False - elif ll.find('(') != -1 and ll.find(')') != -1: - current_tag = ll[1:ll.find('*')] - label_seq.append('B-' + current_tag) - is_in_bracket = False - elif ll.find('(') != -1 and ll.find(')') == -1: - current_tag = ll[1:ll.find('*')] - label_seq.append('B-' + current_tag) - is_in_bracket = True - else: - print 'error:', ll - sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq)) - return sen_lab_pair - - -def write_file(sen_lab_pair, output_file): - with open(output_file, 'w') as fout: - for x in sen_lab_pair: - sentence = x[0] - label_seq = ' '.join(x[2]) - assert len(sentence.split()) == len(x[2]) - fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n') - - -if __name__ == '__main__': - - usage = '-w words_file -p props_file -o output_file' - parser = OptionParser(usage) - parser.add_option('-w', dest='words_file', help='the words file') - parser.add_option('-p', dest='props_file', help='the props file') - parser.add_option('-o', dest='output_file', help='the output_file') - (options, args) = parser.parse_args() - - sentences = read_sentences(options.words_file) - labels = read_labels(options.props_file) - sen_lab_pair = transform_labels(sentences, labels) - - write_file(sen_lab_pair, options.output_file) diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh deleted file mode 100755 index a0ef26a13b9a03392cb8b6207d6d21b7761e38e8..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/get_data.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb -tar -xzvf conll05st-tests.tar.gz -rm conll05st-tests.tar.gz -cp ./conll05st-release/test.wsj/words/test.wsj.words.gz . -cp ./conll05st-release/test.wsj/props/test.wsj.props.gz . 
-gunzip test.wsj.words.gz -gunzip test.wsj.props.gz - -python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair -python extract_dict_feature.py -p test.wsj.seq_pair -f feature diff --git a/demo/semantic_role_labeling/data/test.list b/demo/semantic_role_labeling/data/test.list deleted file mode 100644 index ec370e897a7811b572613150ccb6f665c3adb974..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/test.list +++ /dev/null @@ -1 +0,0 @@ -./data/feature diff --git a/demo/semantic_role_labeling/data/train.list b/demo/semantic_role_labeling/data/train.list deleted file mode 100644 index ec370e897a7811b572613150ccb6f665c3adb974..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/train.list +++ /dev/null @@ -1 +0,0 @@ -./data/feature diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py deleted file mode 100644 index 360c57ea6283ca43986610abf1831742bfc0c3ef..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/dataprovider.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer.PyDataProvider2 import * - -UNK_IDX = 0 - - -def hook(settings, word_dict, label_dict, predicate_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - settings.predicate_dict = predicate_dict - - #all inputs are integral and sequential type - settings.slots = [ - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(predicate_dict)), integer_value_sequence(2), - integer_value_sequence(len(label_dict)) - ] - - -def get_batch_size(yeild_data): - return len(yeild_data[0]) - - -@provider( - init_hook=hook, - should_shuffle=True, - calc_batch_size=get_batch_size, - can_over_batch_size=True, - cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield word_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot diff --git a/demo/semantic_role_labeling/db_lstm.py 
b/demo/semantic_role_labeling/db_lstm.py deleted file mode 100644 index 04e2a559b19bd4b9aec0242eb43edf6ab1e7624e..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/db_lstm.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os -import sys -from paddle.trainer_config_helpers import * - -#file paths -word_dict_file = './data/wordDict.txt' -label_dict_file = './data/targetDict.txt' -predicate_file = './data/verbDict.txt' -train_list_file = './data/train.list' -test_list_file = './data/test.list' - -is_test = get_config_arg('is_test', bool, False) -is_predict = get_config_arg('is_predict', bool, False) - -if not is_predict: - #load dictionaries - word_dict = dict() - label_dict = dict() - predicate_dict = dict() - with open(word_dict_file, 'r') as f_word, \ - open(label_dict_file, 'r') as f_label, \ - open(predicate_file, 'r') as f_pre: - for i, line in enumerate(f_word): - w = line.strip() - word_dict[w] = i - - for i, line in enumerate(f_label): - w = line.strip() - label_dict[w] = i - - for i, line in enumerate(f_pre): - w = line.strip() - predicate_dict[w] = i - - if is_test: - train_list_file = None - - #define data provider - define_py_data_sources2( - train_list=train_list_file, - test_list=test_list_file, - module='dataprovider', - obj='process', - args={ - 'word_dict': word_dict, - 'label_dict': label_dict, - 
'predicate_dict': predicate_dict - }) - - word_dict_len = len(word_dict) - label_dict_len = len(label_dict) - pred_len = len(predicate_dict) - -else: - word_dict_len = get_config_arg('dict_len', int) - label_dict_len = get_config_arg('label_len', int) - pred_len = get_config_arg('pred_len', int) - -############################## Hyper-parameters ################################## -mark_dict_len = 2 -word_dim = 32 -mark_dim = 5 -hidden_dim = 512 -depth = 8 - -########################### Optimizer ####################################### - -settings( - batch_size=150, - learning_method=MomentumOptimizer(momentum=0), - learning_rate=2e-2, - regularization=L2Regularization(8e-4), - is_async=False, - model_average=ModelAverage( - average_window=0.5, max_average_window=10000), ) - -####################################### network ############################## -#8 features and 1 target -word = data_layer(name='word_data', size=word_dict_len) -predicate = data_layer(name='verb_data', size=pred_len) - -ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len) -ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len) -ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len) -ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len) -ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len) -mark = data_layer(name='mark_data', size=mark_dict_len) - -if not is_predict: - target = data_layer(name='target', size=label_dict_len) - -default_std = 1 / math.sqrt(hidden_dim) / 3.0 - -emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.) -std_0 = ParameterAttribute(initial_std=0.) 
-std_default = ParameterAttribute(initial_std=default_std) - -predicate_embedding = embedding_layer( - size=word_dim, - input=predicate, - param_attr=ParameterAttribute( - name='vemb', initial_std=default_std)) -mark_embedding = embedding_layer( - name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0) - -word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] -emb_layers = [ - embedding_layer( - size=word_dim, input=x, param_attr=emb_para) for x in word_input -] -emb_layers.append(predicate_embedding) -emb_layers.append(mark_embedding) - -hidden_0 = mixed_layer( - name='hidden0', - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers - ]) - -mix_hidden_lr = 1e-3 -lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0) -hidden_para_attr = ParameterAttribute( - initial_std=default_std, learning_rate=mix_hidden_lr) - -lstm_0 = lstmemory( - name='lstm0', - input=hidden_0, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - bias_attr=std_0, - param_attr=lstm_para_attr) - -#stack L-LSTM and R-LSTM with direct edges -input_tmp = [hidden_0, lstm_0] - -for i in range(1, depth): - - mix_hidden = mixed_layer( - name='hidden' + str(i), - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = lstmemory( - name='lstm' + str(i), - input=mix_hidden, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - -feature_out = mixed_layer( - name='output', - size=label_dict_len, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - 
input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - -if not is_predict: - crf_l = crf_layer( - name='crf', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute( - name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) - - crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute(name='crfw')) - - eval = sum_evaluator(input=crf_dec_l) - - outputs(crf_l) - -else: - crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - param_attr=ParameterAttribute(name='crfw')) - - outputs(crf_dec_l) diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py deleted file mode 100644 index 372fd090b6e8f08f5bb34697772c2e4976810595..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/predict.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np -from optparse import OptionParser -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import integer_value_sequence -from paddle.trainer.config_parser import parse_config -""" -Usage: run following command to show help message. 
- python predict.py -h -""" -UNK_IDX = 0 - - -class Prediction(): - def __init__(self, train_conf, dict_file, model_dir, label_file, - predicate_dict_file): - """ - train_conf: trainer configure. - dict_file: word dictionary file name. - model_dir: directory of model. - """ - - self.dict = {} - self.labels = {} - self.predicate_dict = {} - self.labels_reverse = {} - self.load_dict_label(dict_file, label_file, predicate_dict_file) - - len_dict = len(self.dict) - len_label = len(self.labels) - len_pred = len(self.predicate_dict) - - conf = parse_config( - train_conf, 'dict_len=' + str(len_dict) + ',label_len=' + - str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True') - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(model_dir) - - slots = [ - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_pred), integer_value_sequence(2) - ] - self.converter = DataProviderConverter(slots) - - def load_dict_label(self, dict_file, label_file, predicate_dict_file): - """ - Load dictionary from self.dict_file. - """ - for line_count, line in enumerate(open(dict_file, 'r')): - self.dict[line.strip()] = line_count - - for line_count, line in enumerate(open(label_file, 'r')): - self.labels[line.strip()] = line_count - self.labels_reverse[line_count] = line.strip() - - for line_count, line in enumerate(open(predicate_dict_file, 'r')): - self.predicate_dict[line.strip()] = line_count - - def get_data(self, data_file): - """ - Get input data of paddle format. 
- """ - with open(data_file, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip( - ).split('\t') - words = sentence.split() - sen_len = len(words) - - word_slot = [self.dict.get(w, UNK_IDX) for w in words] - predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX) - ] * sen_len - ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - yield word_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot - - def predict(self, data_file, output_file): - """ - data_file: file name of input data. - """ - input = self.converter(self.get_data(data_file)) - output = self.network.forwardTest(input) - lab = output[0]["id"].tolist() - - with open(data_file, 'r') as fin, open(output_file, 'w') as fout: - index = 0 - for line in fin: - sen = line.split('\t')[0] - len_sen = len(sen.split()) - line_labels = lab[index:index + len_sen] - index += len_sen - fout.write(sen + '\t' + ' '.join( - [self.labels_reverse[i] for i in line_labels]) + '\n') - - -def option_parser(): - usage = ( - "python predict.py -c config -w model_dir " - "-d word dictionary -l label_file -i input_file -p pred_dict_file") - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-c", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-l", - "--label", - action="store", - dest="label_file", - default=None, - help="label file") - parser.add_option( - "-p", - "--predict_dict_file", - action="store", - 
dest="predict_dict_file", - default=None, - help="predict_dict_file") - parser.add_option( - "-i", - "--data", - action="store", - dest="data_file", - help="data file to predict") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - - parser.add_option( - "-o", - "--output_file", - action="store", - dest="output_file", - default=None, - help="output file") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - data_file = options.data_file - dict_file = options.dict_file - model_path = options.model_path - label_file = options.label_file - predict_dict_file = options.predict_dict_file - output_file = options.output_file - - swig_paddle.initPaddle("--use_gpu=0") - predict = Prediction(train_conf, dict_file, model_path, label_file, - predict_dict_file) - predict.predict(data_file, output_file) - - -if __name__ == '__main__': - main() diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh deleted file mode 100755 index 873aad670d16803ce321ab60baabe9fe29ea64bf..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/predict.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -best_model_path="output/pass-${LOG[1]}" - -config_file=db_lstm.py -dict_file=./data/wordDict.txt -label_file=./data/targetDict.txt -predicate_dict_file=./data/verbDict.txt -input_file=./data/feature -output_file=predict.res - -python predict.py \ - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh deleted file mode 100755 index 095bbff2ea42627a13d8ebab436f5a05abc09743..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/test.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list - -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --config_args=is_test=1 \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'test.log' -paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1 diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh deleted file mode 100755 index eee14010d7b04a1b824f39090fa82fc532085e0d..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -paddle train \ - --config=./db_lstm.py \ - --use_gpu=0 \ - --log_period=5000 \ - --trainer_count=1 \ - --show_parameter_stats_period=5000 \ - --save_dir=./output \ - --num_passes=10000 \ - --average_test_period=10000000 \ - --init_model_path=./data \ - --load_missing_parameter_strategy=rand \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -paddle usage -l train.log -e $? 
-n "semantic_role_labeling_train" >/dev/null 2>&1 diff --git a/demo/sentiment/.gitignore b/demo/sentiment/.gitignore deleted file mode 100644 index bf2a9ab1ce3c937bf06179074cd952dc53591dfd..0000000000000000000000000000000000000000 --- a/demo/sentiment/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -data/aclImdb -data/imdb -data/pre-imdb -data/mosesdecoder-master -logs/ -model_output -dataprovider_copy_1.py -model.list -test.log -train.log -*.pyc diff --git a/demo/sentiment/data/get_imdb.sh b/demo/sentiment/data/get_imdb.sh deleted file mode 100755 index 7600af6fbb900ee845702f1297779c1f0ed9bf84..0000000000000000000000000000000000000000 --- a/demo/sentiment/data/get_imdb.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e -set -x - -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR - -#download the dataset -echo "Downloading aclImdb..." -#http://ai.stanford.edu/%7Eamaas/data/sentiment/ -wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz - -echo "Downloading mosesdecoder..." -#https://github.com/moses-smt/mosesdecoder -wget https://github.com/moses-smt/mosesdecoder/archive/master.zip - -#extract package -echo "Unzipping..." 
-tar -zxvf aclImdb_v1.tar.gz -unzip master.zip - -#move train and test set to imdb_data directory -#in order to process when traing -mkdir -p imdb/train -mkdir -p imdb/test - -cp -r aclImdb/train/pos/ imdb/train/pos -cp -r aclImdb/train/neg/ imdb/train/neg - -cp -r aclImdb/test/pos/ imdb/test/pos -cp -r aclImdb/test/neg/ imdb/test/neg - -#remove compressed package -rm aclImdb_v1.tar.gz -rm master.zip - -echo "Done." diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py deleted file mode 100755 index 4b7f5d0e504aef3884a04cbed8c16503a4079772..0000000000000000000000000000000000000000 --- a/demo/sentiment/dataprovider.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from paddle.trainer.PyDataProvider2 import * - - -def hook(settings, dictionary, **kwargs): - settings.word_dict = dictionary - settings.input_types = [ - integer_value_sequence(len(settings.word_dict)), integer_value(2) - ] - settings.logger.info('dict len : %d' % (len(settings.word_dict))) - - -@provider(init_hook=hook) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line_count, line in enumerate(fdata): - label, comment = line.strip().split('\t\t') - label = int(label) - words = comment.split() - word_slot = [ - settings.word_dict[w] for w in words if w in settings.word_dict - ] - if not word_slot: - continue - yield word_slot, label diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py deleted file mode 100755 index 64c78e0d6b9297e7a321a4f070517593b0bfe332..0000000000000000000000000000000000000000 --- a/demo/sentiment/predict.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os, sys -import numpy as np -from optparse import OptionParser -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import integer_value_sequence -from paddle.trainer.config_parser import parse_config -""" -Usage: run following command to show help message. 
- python predict.py -h -""" - - -class SentimentPrediction(): - def __init__(self, train_conf, dict_file, model_dir=None, label_file=None): - """ - train_conf: trainer configure. - dict_file: word dictionary file name. - model_dir: directory of model. - """ - self.train_conf = train_conf - self.dict_file = dict_file - self.word_dict = {} - self.dict_dim = self.load_dict() - self.model_dir = model_dir - if model_dir is None: - self.model_dir = os.path.dirname(train_conf) - - self.label = None - if label_file is not None: - self.load_label(label_file) - - conf = parse_config(train_conf, "is_predict=1") - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(self.model_dir) - input_types = [integer_value_sequence(self.dict_dim)] - self.converter = DataProviderConverter(input_types) - - def load_dict(self): - """ - Load dictionary from self.dict_file. - """ - for line_count, line in enumerate(open(self.dict_file, 'r')): - self.word_dict[line.strip().split('\t')[0]] = line_count - return len(self.word_dict) - - def load_label(self, label_file): - """ - Load label. - """ - self.label = {} - for v in open(label_file, 'r'): - self.label[int(v.split('\t')[1])] = v.split('\t')[0] - - def get_index(self, data): - """ - transform word into integer index according to the dictionary. 
- """ - words = data.strip().split() - word_slot = [self.word_dict[w] for w in words if w in self.word_dict] - return word_slot - - def batch_predict(self, data_batch): - input = self.converter(data_batch) - output = self.network.forwardTest(input) - prob = output[0]["value"] - labs = np.argsort(-prob) - for idx, lab in enumerate(labs): - if self.label is None: - print("predicting label is %d" % (lab[0])) - else: - print("predicting label is %s" % (self.label[lab[0]])) - - -def option_parser(): - usage = "python predict.py -n config -w model_dir -d dictionary -i input_file " - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-n", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-b", - "--label", - action="store", - dest="label", - default=None, - help="dictionary file") - parser.add_option( - "-c", - "--batch_size", - type="int", - action="store", - dest="batch_size", - default=1, - help="the batch size for prediction") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - batch_size = options.batch_size - dict_file = options.dict_file - model_path = options.model_path - label = options.label - swig_paddle.initPaddle("--use_gpu=0") - predict = SentimentPrediction(train_conf, dict_file, model_path, label) - - batch = [] - for line in sys.stdin: - words = predict.get_index(line) - if words: - batch.append([words]) - else: - print('All the words in [%s] are not in the dictionary.' 
% line) - if len(batch) == batch_size: - predict.batch_predict(batch) - batch = [] - if len(batch) > 0: - predict.batch_predict(batch) - - -if __name__ == '__main__': - main() diff --git a/demo/sentiment/predict.sh b/demo/sentiment/predict.sh deleted file mode 100755 index c72a8e8641516543ef267fcb4b448630246d1e8d..0000000000000000000000000000000000000000 --- a/demo/sentiment/predict.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config\ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 diff --git a/demo/sentiment/preprocess.py b/demo/sentiment/preprocess.py deleted file mode 100755 index 29b3682b747c66574590de5ea70574981cc536bb..0000000000000000000000000000000000000000 --- a/demo/sentiment/preprocess.py +++ /dev/null @@ -1,359 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import random -import operator -import numpy as np -from subprocess import Popen, PIPE -from os.path import join as join_path -from optparse import OptionParser - -from paddle.utils.preprocess_util import * -""" -Usage: run following command to show help message. - python preprocess.py -h -""" - - -def save_dict(dict, filename, is_reverse=True): - """ - Save dictionary into file. - dict: input dictionary. - filename: output file name, string. - is_reverse: True, descending order by value. - False, ascending order by value. - """ - f = open(filename, 'w') - for k, v in sorted(dict.items(), key=operator.itemgetter(1),\ - reverse=is_reverse): - f.write('%s\t%s\n' % (k, v)) - f.close() - - -def tokenize(sentences): - """ - Use tokenizer.perl to tokenize input sentences. - tokenizer.perl is tool of Moses. - sentences : a list of input sentences. - return: a list of processed text. - """ - dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl' - tokenizer_cmd = [dir, '-l', 'en', '-q', '-'] - assert isinstance(sentences, list) - text = "\n".join(sentences) - tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) - tok_text, _ = tokenizer.communicate(text) - toks = tok_text.split('\n')[:-1] - return toks - - -def read_lines(path): - """ - path: String, file path. - return a list of sequence. 
- """ - seqs = [] - with open(path, 'r') as f: - for line in f.readlines(): - line = line.strip() - if len(line): - seqs.append(line) - return seqs - - -class SentimentDataSetCreate(): - """ - A class to process data for sentiment analysis task. - """ - - def __init__(self, - data_path, - output_path, - use_okenizer=True, - multi_lines=False): - """ - data_path: string, traing and testing dataset path - output_path: string, output path, store processed dataset - multi_lines: whether a file has multi lines. - In order to shuffle fully, it needs to read all files into - memory, then shuffle them if one file has multi lines. - """ - self.output_path = output_path - self.data_path = data_path - - self.train_dir = 'train' - self.test_dir = 'test' - - self.train_list = "train.list" - self.test_list = "test.list" - - self.label_list = "labels.list" - self.classes_num = 0 - - self.batch_size = 50000 - self.batch_dir = 'batches' - - self.dict_file = "dict.txt" - self.dict_with_test = False - self.dict_size = 0 - self.word_count = {} - - self.tokenizer = use_okenizer - self.overwrite = False - - self.multi_lines = multi_lines - - self.train_dir = join_path(data_path, self.train_dir) - self.test_dir = join_path(data_path, self.test_dir) - self.train_list = join_path(output_path, self.train_list) - self.test_list = join_path(output_path, self.test_list) - self.label_list = join_path(output_path, self.label_list) - self.dict_file = join_path(output_path, self.dict_file) - - def data_list(self, path): - """ - create dataset from path - path: data path - return: data list - """ - label_set = get_label_set_from_dir(path) - data = [] - for lab_name in label_set.keys(): - file_paths = list_files(join_path(path, lab_name)) - for p in file_paths: - data.append({"label" : label_set[lab_name],\ - "seq_path": p}) - return data, label_set - - def create_dict(self, data): - """ - create dict for input data. - data: list, [sequence, sequnce, ...] 
- """ - for seq in data: - for w in seq.strip().lower().split(): - if w not in self.word_count: - self.word_count[w] = 1 - else: - self.word_count[w] += 1 - - def create_dataset(self): - """ - create file batches and dictionary of train data set. - If the self.overwrite is false and train.list already exists in - self.output_path, this function will not create and save file - batches from the data set path. - return: dictionary size, class number. - """ - out_path = self.output_path - if out_path and not os.path.exists(out_path): - os.makedirs(out_path) - - # If self.overwrite is false or self.train_list has existed, - # it will not process dataset. - if not (self.overwrite or not os.path.exists(self.train_list)): - print "%s already exists." % self.train_list - return - - # Preprocess train data. - train_data, train_lab_set = self.data_list(self.train_dir) - print "processing train set..." - file_lists = self.save_data(train_data, "train", self.batch_size, True, - True) - save_list(file_lists, self.train_list) - - # If have test data path, preprocess test data. - if os.path.exists(self.test_dir): - test_data, test_lab_set = self.data_list(self.test_dir) - assert (train_lab_set == test_lab_set) - print "processing test set..." - file_lists = self.save_data(test_data, "test", self.batch_size, - False, self.dict_with_test) - save_list(file_lists, self.test_list) - - # save labels set. - save_dict(train_lab_set, self.label_list, False) - self.classes_num = len(train_lab_set.keys()) - - # save dictionary. - save_dict(self.word_count, self.dict_file, True) - self.dict_size = len(self.word_count) - - def save_data(self, - data, - prefix="", - batch_size=50000, - is_shuffle=False, - build_dict=False): - """ - Create batches for a Dataset object. - data: the Dataset object to process. - prefix: the prefix of each batch. - batch_size: number of data in each batch. 
- build_dict: whether to build dictionary for data - - return: list of batch names - """ - if is_shuffle and self.multi_lines: - return self.save_data_multi_lines(data, prefix, batch_size, - build_dict) - - if is_shuffle: - random.shuffle(data) - num_batches = int(math.ceil(len(data) / float(batch_size))) - batch_names = [] - for i in range(num_batches): - batch_name = join_path(self.output_path, - "%s_part_%03d" % (prefix, i)) - begin = i * batch_size - end = min((i + 1) * batch_size, len(data)) - # read a batch of data - label_list, data_list = self.get_data_list(begin, end, data) - if build_dict: - self.create_dict(data_list) - self.save_file(label_list, data_list, batch_name) - batch_names.append(batch_name) - - return batch_names - - def get_data_list(self, begin, end, data): - """ - begin: int, begining index of data. - end: int, ending index of data. - data: a list of {"seq_path": seqquence path, "label": label index} - - return a list of label and a list of sequence. - """ - label_list = [] - data_list = [] - for j in range(begin, end): - seqs = read_lines(data[j]["seq_path"]) - lab = int(data[j]["label"]) - #File may have multiple lines. - for seq in seqs: - data_list.append(seq) - label_list.append(lab) - if self.tokenizer: - data_list = tokenize(data_list) - return label_list, data_list - - def save_data_multi_lines(self, - data, - prefix="", - batch_size=50000, - build_dict=False): - """ - In order to shuffle fully, there is no need to load all data if - each file only contains one sample, it only needs to shuffle list - of file name. But one file contains multi lines, each line is one - sample. It needs to read all data into memory to shuffle fully. - This interface is mainly for data containning multi lines in each - file, which consumes more memory if there is a great mount of data. - - data: the Dataset object to process. - prefix: the prefix of each batch. - batch_size: number of data in each batch. 
- build_dict: whether to build dictionary for data - - return: list of batch names - """ - assert self.multi_lines - label_list = [] - data_list = [] - - # read all data - label_list, data_list = self.get_data_list(0, len(data), data) - if build_dict: - self.create_dict(data_list) - - length = len(label_list) - perm_list = np.array([i for i in xrange(length)]) - random.shuffle(perm_list) - - num_batches = int(math.ceil(length / float(batch_size))) - batch_names = [] - for i in range(num_batches): - batch_name = join_path(self.output_path, - "%s_part_%03d" % (prefix, i)) - begin = i * batch_size - end = min((i + 1) * batch_size, length) - sub_label = [label_list[perm_list[i]] for i in range(begin, end)] - sub_data = [data_list[perm_list[i]] for i in range(begin, end)] - self.save_file(sub_label, sub_data, batch_name) - batch_names.append(batch_name) - - return batch_names - - def save_file(self, label_list, data_list, filename): - """ - Save data into file. - label_list: a list of int value. - data_list: a list of sequnece. - filename: output file name. 
- """ - f = open(filename, 'w') - print "saving file: %s" % filename - for lab, seq in zip(label_list, data_list): - f.write('%s\t\t%s\n' % (lab, seq)) - f.close() - - -def option_parser(): - parser = OptionParser(usage="usage: python preprcoess.py "\ - "-i data_dir [options]") - parser.add_option( - "-i", - "--data", - action="store", - dest="input", - help="Input data directory.") - parser.add_option( - "-o", - "--output", - action="store", - dest="output", - default=None, - help="Output directory.") - parser.add_option( - "-t", - "--tokenizer", - action="store", - dest="use_tokenizer", - default=True, - help="Whether to use tokenizer.") - parser.add_option("-m", "--multi_lines", action="store", - dest="multi_lines", default=False, - help="If input text files have multi lines and they "\ - "need to be shuffled, you should set -m True,") - return parser.parse_args() - - -def main(): - options, args = option_parser() - data_dir = options.input - output_dir = options.output - use_tokenizer = options.use_tokenizer - multi_lines = options.multi_lines - if output_dir is None: - outname = os.path.basename(options.input) - output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname) - data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer, - multi_lines) - data_creator.create_dataset() - - -if __name__ == '__main__': - main() diff --git a/demo/sentiment/sentiment_net.py b/demo/sentiment/sentiment_net.py deleted file mode 100644 index a01577ca5ae025b7bec67c6d54c7dbd931dbee74..0000000000000000000000000000000000000000 --- a/demo/sentiment/sentiment_net.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from os.path import join as join_path - -from paddle.trainer_config_helpers import * - - -def sentiment_data(data_dir=None, - is_test=False, - is_predict=False, - train_list="train.list", - test_list="test.list", - dict_file="dict.txt"): - """ - Predefined data provider for sentiment analysis. - is_test: whether this config is used for test. - is_predict: whether this config is used for prediction. - train_list: text file name, containing a list of training set. - test_list: text file name, containing a list of testing set. - dict_file: text file name, containing dictionary. - """ - dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines()) - class_dim = len(open(join_path(data_dir, 'labels.list')).readlines()) - if is_predict: - return dict_dim, class_dim - - if data_dir is not None: - train_list = join_path(data_dir, train_list) - test_list = join_path(data_dir, test_list) - dict_file = join_path(data_dir, dict_file) - - train_list = train_list if not is_test else None - word_dict = dict() - with open(dict_file, 'r') as f: - for i, line in enumerate(open(dict_file, 'r')): - word_dict[line.split('\t')[0]] = i - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={'dictionary': word_dict}) - - return dict_dim, class_dim - - -def bidirectional_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - lstm_dim=128, - is_predict=False): - data = data_layer("word", input_dim) - emb = embedding_layer(input=data, size=emb_dim) - bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim) - dropout = 
dropout_layer(input=bi_lstm, dropout_rate=0.5) - output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation()) - - if not is_predict: - lbl = data_layer("label", 1) - outputs(classification_cost(input=output, label=lbl)) - else: - outputs(output) - - -def stacked_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3, - is_predict=False): - """ - A Wrapper for sentiment classification task. - This network uses bi-directional recurrent network, - consisting three LSTM layers. This configure is referred to - the paper as following url, but use fewer layrs. - http://www.aclweb.org/anthology/P15-1109 - - input_dim: here is word dictionary dimension. - class_dim: number of categories. - emb_dim: dimension of word embedding. - hid_dim: dimension of hidden layer. - stacked_num: number of stacked lstm-hidden layer. - is_predict: is predicting or not. - Some layers is not needed in network when predicting. - """ - hid_lr = 1e-3 - assert stacked_num % 2 == 1 - - layer_attr = ExtraLayerAttribute(drop_rate=0.5) - fc_para_attr = ParameterAttribute(learning_rate=hid_lr) - lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.) - para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.) 
- relu = ReluActivation() - linear = LinearActivation() - - data = data_layer("word", input_dim) - emb = embedding_layer(input=data, size=emb_dim) - - fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr) - lstm1 = lstmemory( - input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) - - inputs = [fc1, lstm1] - for i in range(2, stacked_num + 1): - fc = fc_layer( - input=inputs, - size=hid_dim, - act=linear, - param_attr=para_attr, - bias_attr=bias_attr) - lstm = lstmemory( - input=fc, - reverse=(i % 2) == 0, - act=relu, - bias_attr=bias_attr, - layer_attr=layer_attr) - inputs = [fc, lstm] - - fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling()) - lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling()) - output = fc_layer( - input=[fc_last, lstm_last], - size=class_dim, - act=SoftmaxActivation(), - bias_attr=bias_attr, - param_attr=para_attr) - - if is_predict: - outputs(output) - else: - outputs(classification_cost(input=output, label=data_layer('label', 1))) diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh deleted file mode 100755 index 85c4f3ccfc3ede23fcf701769b9701ecbf57c789..0000000000000000000000000000000000000000 --- a/demo/sentiment/test.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' -paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1 diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh deleted file mode 100755 index 14620f733bf03444e5ba3b3b792dfbed6146ecde..0000000000000000000000000000000000000000 --- a/demo/sentiment/train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -config=trainer_config.py -output=./model_output -paddle train --config=$config \ - --save_dir=$output \ - --job=train \ - --use_gpu=false \ - --trainer_count=4 \ - --num_passes=10 \ - --log_period=10 \ - --dot_period=20 \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -paddle usage -l train.log -e $? 
-n "sentiment_train" >/dev/null 2>&1 diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py deleted file mode 100644 index 1c856556bd0cb32f60eba322469b3621c37e1349..0000000000000000000000000000000000000000 --- a/demo/sentiment/train_v2.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import paddle.v2 as paddle - - -def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - conv_3 = paddle.networks.sequence_conv_pool( - input=emb, context_len=3, hidden_size=hid_dim) - conv_4 = paddle.networks.sequence_conv_pool( - input=emb, context_len=4, hidden_size=hid_dim) - output = paddle.layer.fc(input=[conv_3, conv_4], - size=class_dim, - act=paddle.activation.Softmax()) - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost - - -def stacked_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3): - """ - A Wrapper for sentiment classification task. - This network uses bi-directional recurrent network, - consisting three LSTM layers. This configure is referred to - the paper as following url, but use fewer layrs. 
- http://www.aclweb.org/anthology/P15-1109 - - input_dim: here is word dictionary dimension. - class_dim: number of categories. - emb_dim: dimension of word embedding. - hid_dim: dimension of hidden layer. - stacked_num: number of stacked lstm-hidden layer. - """ - assert stacked_num % 2 == 1 - - layer_attr = paddle.attr.Extra(drop_rate=0.5) - fc_para_attr = paddle.attr.Param(learning_rate=1e-3) - lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) - para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) - relu = paddle.activation.Relu() - linear = paddle.activation.Linear() - - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - - fc1 = paddle.layer.fc(input=emb, - size=hid_dim, - act=linear, - bias_attr=bias_attr) - lstm1 = paddle.layer.lstmemory( - input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) - - inputs = [fc1, lstm1] - for i in range(2, stacked_num + 1): - fc = paddle.layer.fc(input=inputs, - size=hid_dim, - act=linear, - param_attr=para_attr, - bias_attr=bias_attr) - lstm = paddle.layer.lstmemory( - input=fc, - reverse=(i % 2) == 0, - act=relu, - bias_attr=bias_attr, - layer_attr=layer_attr) - inputs = [fc, lstm] - - fc_last = paddle.layer.pooling( - input=inputs[0], pooling_type=paddle.pooling.Max()) - lstm_last = paddle.layer.pooling( - input=inputs[1], pooling_type=paddle.pooling.Max()) - output = paddle.layer.fc(input=[fc_last, lstm_last], - size=class_dim, - act=paddle.activation.Softmax(), - bias_attr=bias_attr, - param_attr=para_attr) - - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost - - -if __name__ == '__main__': - # init - paddle.init(use_gpu=False) - - #data - print 'load dictionary...' 
- word_dict = paddle.dataset.imdb.word_dict() - dict_dim = len(word_dict) - class_dim = 2 - train_reader = paddle.batch( - paddle.reader.shuffle( - lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=100) - test_reader = paddle.batch( - lambda: paddle.dataset.imdb.test(word_dict), batch_size=100) - - feeding = {'word': 0, 'label': 1} - - # network config - # Please choose the way to build the network - # by uncommenting the corresponding line. - cost = convolution_net(dict_dim, class_dim=class_dim) - # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) - - # create parameters - parameters = paddle.parameters.create(cost) - - # create optimizer - adam_optimizer = paddle.optimizer.Adam( - learning_rate=2e-3, - regularization=paddle.optimizer.L2Regularization(rate=8e-4), - model_average=paddle.optimizer.ModelAverage(average_window=0.5)) - - # End batch and end pass event handler - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader, feeding=feeding) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - - # create trainer - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=adam_optimizer) - - trainer.train( - reader=train_reader, - event_handler=event_handler, - feeding=feeding, - num_passes=2) diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py deleted file mode 100644 index f1cadaa728ac58107e15f77b5994d31da088caf7..0000000000000000000000000000000000000000 --- a/demo/sentiment/trainer_config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from sentiment_net import * -from paddle.trainer_config_helpers import * - -# whether this config is used for test -is_test = get_config_arg('is_test', bool, False) -# whether this config is used for prediction -is_predict = get_config_arg('is_predict', bool, False) - -data_dir = "./data/pre-imdb" -dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) - -################## Algorithm Config ##################### - -settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - model_average=ModelAverage(0.5), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -#################### Network Config ###################### -stacked_lstm_net( - dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict) -# bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) diff --git a/demo/seqToseq/.gitignore b/demo/seqToseq/.gitignore deleted file mode 100644 index 21cec2c2c1f3422cbb0ad133281dc1ecdd076a96..0000000000000000000000000000000000000000 --- a/demo/seqToseq/.gitignore +++ /dev/null @@ -1,17 +0,0 @@ -data/wmt14 -data/pre-wmt14 -data/wmt14_model -data/paraphrase -data/pre-paraphrase -data/paraphrase_model -translation/gen.log -translation/gen_result -translation/train.log -paraphrase/train.log -dataprovider_copy_1.py -translation/thirdparty.tgz -translation/thirdparty/train.conf 
-translation/thirdparty/dataprovider.py -translation/thirdparty/seqToseq_net.py -translation/thirdparty/*.dict -*.pyc diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py deleted file mode 100644 index 3072c375123a2713c655b09fb28001960c9ab64d..0000000000000000000000000000000000000000 --- a/demo/seqToseq/api_train_v2.py +++ /dev/null @@ -1,214 +0,0 @@ -import sys - -import paddle.v2 as paddle - - -def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): - ### Network Architecture - word_vector_dim = 512 # dimension of word vector - decoder_size = 512 # dimension of hidden unit in GRU Decoder network - encoder_size = 512 # dimension of hidden unit in GRU Encoder network - - beam_size = 3 - max_length = 250 - - #### Encoder - src_word_id = paddle.layer.data( - name='source_language_word', - type=paddle.data_type.integer_value_sequence(source_dict_dim)) - src_embedding = paddle.layer.embedding( - input=src_word_id, - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) - src_forward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size) - src_backward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size, reverse=True) - encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) - - #### Decoder - with paddle.layer.mixed(size=decoder_size) as encoded_proj: - encoded_proj += paddle.layer.full_matrix_projection( - input=encoded_vector) - - backward_first = paddle.layer.first_seq(input=src_backward) - - with paddle.layer.mixed( - size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot: - decoder_boot += paddle.layer.full_matrix_projection( - input=backward_first) - - def gru_decoder_with_attention(enc_vec, enc_proj, current_word): - - decoder_mem = paddle.layer.memory( - name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) - - context = paddle.networks.simple_attention( - encoded_sequence=enc_vec, - encoded_proj=enc_proj, - 
decoder_state=decoder_mem) - - with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += paddle.layer.full_matrix_projection(input=context) - decoder_inputs += paddle.layer.full_matrix_projection( - input=current_word) - - gru_step = paddle.layer.gru_step( - name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) - - with paddle.layer.mixed( - size=target_dict_dim, - bias_attr=True, - act=paddle.activation.Softmax()) as out: - out += paddle.layer.full_matrix_projection(input=gru_step) - return out - - decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True) - group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True) - group_inputs = [group_input1, group_input2] - - if not is_generating: - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - # For decoder equipped with attention mechanism, in training, - # target embeding (the groudtruth) is the data input, - # while encoded source sequence is accessed to as an unbounded memory. - # Here, the StaticInput defines a read-only memory - # for the recurrent_group. - decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost - else: - # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. 
- - # The encoded source sequence (encoder's output) must be specified by - # StaticInput, which is a read-only memory. - # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. - - trg_embedding = paddle.layer.GeneratedInputV2( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) - group_inputs.append(trg_embedding) - - beam_gen = paddle.layer.beam_search( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - return beam_gen - - -def main(): - paddle.init(use_gpu=False, trainer_count=1) - is_generating = False - - # source and target dict dim. - dict_size = 30000 - source_dict_dim = target_dict_dim = dict_size - - # train the network - if not is_generating: - cost = seqToseq_net(source_dict_dim, target_dict_dim) - parameters = paddle.parameters.create(cost) - - # define optimize method and trainer - optimizer = paddle.optimizer.Adam( - learning_rate=5e-5, - regularization=paddle.optimizer.L2Regularization(rate=8e-4)) - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=optimizer) - # define data reader - wmt14_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=8192), - batch_size=5) - - # define event_handler callback - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 10 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, - event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - - # start to train - trainer.train( - reader=wmt14_reader, event_handler=event_handler, num_passes=2) - - # generate a english sequence to french - else: - # use the first 3 samples for generation - gen_creator = 
paddle.dataset.wmt14.gen(dict_size) - gen_data = [] - gen_num = 3 - for item in gen_creator(): - gen_data.append((item[0], )) - if len(gen_data) == gen_num: - break - - beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating) - # get the pretrained model, whose bleu = 26.92 - parameters = paddle.dataset.wmt14.model() - # prob is the prediction probabilities, and id is the prediction word. - beam_result = paddle.infer( - output_layer=beam_gen, - parameters=parameters, - input=gen_data, - field=['prob', 'id']) - - # get the dictionary - src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) - - # the delimited element of generated sequences is -1, - # the first element of each generated sequence is the sequence length - seq_list = [] - seq = [] - for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - - prob = beam_result[0] - beam_size = 3 - for i in xrange(gen_num): - print "\n*******************************************************\n" - print "src:", ' '.join( - [src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] - - -if __name__ == '__main__': - main() diff --git a/demo/seqToseq/data/paraphrase_data.sh b/demo/seqToseq/data/paraphrase_data.sh deleted file mode 100755 index e6497c91286d44b5ef3b66c5f824e36a09728720..0000000000000000000000000000000000000000 --- a/demo/seqToseq/data/paraphrase_data.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x - -# download the in-house paraphrase dataset -wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz - -# untar the dataset -tar -zxvf paraphrase.tar.gz -rm paraphrase.tar.gz diff --git a/demo/seqToseq/data/paraphrase_model.sh b/demo/seqToseq/data/paraphrase_model.sh deleted file mode 100755 index d0e7f214a38c4dad0fdf7c10ba3b76eb0ab40f06..0000000000000000000000000000000000000000 --- a/demo/seqToseq/data/paraphrase_model.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e -set -x - -dim=32 -pretrained_dir='../../model_zoo/embedding/' -preModel=$pretrained_dir'model_'$dim'.emb' -preDict=$pretrained_dir'baidu.dict' - -usrDict_dir='pre-paraphrase/' -srcDict=$usrDict_dir'src.dict' -trgDict=$usrDict_dir'trg.dict' - -usrModel_dir='paraphrase_model/' -mkdir $usrModel_dir -srcModel=$usrModel_dir'_source_language_embedding' -trgModel=$usrModel_dir'_target_language_embedding' - -echo 'extract desired parameters based on user dictionary' -script=$pretrained_dir'extract_para.py' -python $script --preModel $preModel --preDict $preDict \ - --usrModel $srcModel --usrDict $srcDict -d $dim -python $script --preModel $preModel --preDict $preDict \ - --usrModel $trgModel --usrDict $trgDict -d $dim diff --git a/demo/seqToseq/data/wmt14_data.sh b/demo/seqToseq/data/wmt14_data.sh deleted file mode 100755 index 43f67168d2a876ba5401e0f8490a88adac9c5551..0000000000000000000000000000000000000000 --- a/demo/seqToseq/data/wmt14_data.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e -set -x -mkdir wmt14 -cd wmt14 - -# download the dataset -wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz -wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz - -# untar the dataset -tar -zxvf bitexts.tgz -tar -zxvf dev+test.tgz -gunzip bitexts.selected/* -mv bitexts.selected train -rm bitexts.tgz -rm dev+test.tgz - -# separate the dev and test dataset -mkdir test gen -mv dev/ntst1213.* test -mv dev/ntst14.* gen -rm -rf dev - -set +x -# rename the suffix, .fr->.src, .en->.trg -for dir in train test gen -do - filelist=`ls $dir` - cd $dir - for file in $filelist - do - if [ ${file##*.} = "fr" ]; then - mv $file ${file/%fr/src} - elif [ ${file##*.} = 'en' ]; then - mv $file ${file/%en/trg} - fi - done - cd .. -done diff --git a/demo/seqToseq/data/wmt14_model.sh b/demo/seqToseq/data/wmt14_model.sh deleted file mode 100755 index c4b55b90a3eb98f94e0eb3be028c6de1ef57326b..0000000000000000000000000000000000000000 --- a/demo/seqToseq/data/wmt14_model.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e -set -x - -# download the pretrained model -wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz - -# untar the model -tar -zxvf wmt14_model.tar.gz -rm wmt14_model.tar.gz diff --git a/demo/seqToseq/dataprovider.py b/demo/seqToseq/dataprovider.py deleted file mode 100755 index c2b49804be582d7d0bc3ef6332741be03936eb24..0000000000000000000000000000000000000000 --- a/demo/seqToseq/dataprovider.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer.PyDataProvider2 import * - -UNK_IDX = 2 -START = "" -END = "" - - -def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list, - **kwargs): - # job_mode = 1: training mode - # job_mode = 0: generating mode - settings.job_mode = not is_generating - - def fun(dict_path): - out_dict = dict() - with open(dict_path, "r") as fin: - out_dict = { - line.strip(): line_count - for line_count, line in enumerate(fin) - } - return out_dict - - settings.src_dict = fun(src_dict_path) - settings.trg_dict = fun(trg_dict_path) - - settings.logger.info("src dict len : %d" % (len(settings.src_dict))) - - if settings.job_mode: - settings.slots = { - 'source_language_word': - integer_value_sequence(len(settings.src_dict)), - 'target_language_word': - integer_value_sequence(len(settings.trg_dict)), - 'target_language_next_word': - integer_value_sequence(len(settings.trg_dict)) - } - settings.logger.info("trg dict len : %d" % (len(settings.trg_dict))) - else: - settings.slots = { - 'source_language_word': - integer_value_sequence(len(settings.src_dict)), - 'sent_id': - integer_value_sequence(len(open(file_list[0], "r").readlines())) - } - - -def _get_ids(s, dictionary): - words = s.strip().split() - return [dictionary[START]] + \ - [dictionary.get(w, UNK_IDX) for w in words] + \ - [dictionary[END]] - - -@provider(init_hook=hook, pool_size=50000) -def process(settings, file_name): - with open(file_name, 'r') as f: - for line_count, line in enumerate(f): - line_split = line.strip().split('\t') - if settings.job_mode and len(line_split) != 2: - continue - src_seq = line_split[0] # one source sequence - src_ids = _get_ids(src_seq, settings.src_dict) - - if settings.job_mode: - trg_seq = line_split[1] # one target sequence - trg_words = trg_seq.split() - trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words] - - # remove sequence whose length > 80 in training mode - if len(src_ids) > 80 or len(trg_ids) > 80: - continue - trg_ids_next = trg_ids + 
[settings.trg_dict[END]] - trg_ids = [settings.trg_dict[START]] + trg_ids - yield { - 'source_language_word': src_ids, - 'target_language_word': trg_ids, - 'target_language_next_word': trg_ids_next - } - else: - yield {'source_language_word': src_ids, 'sent_id': [line_count]} diff --git a/demo/seqToseq/paraphrase/train.conf b/demo/seqToseq/paraphrase/train.conf deleted file mode 100644 index be79c5e771c0e864fd1776cedb3ef37c997b6df6..0000000000000000000000000000000000000000 --- a/demo/seqToseq/paraphrase/train.conf +++ /dev/null @@ -1,33 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -sys.path.append("..") - -from seqToseq_net import * - -is_generating = False -### Data Definiation -train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase", - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32) diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh deleted file mode 100755 index 9bb6dbdb1d4c5e35bfb31855e0331f0250a69a20..0000000000000000000000000000000000000000 --- a/demo/seqToseq/paraphrase/train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -cd .. - -paddle train \ - --config='paraphrase/train.conf' \ - --save_dir='paraphrase/model' \ - --init_model_path='data/paraphrase_model' \ - --load_missing_parameter_strategy=rand \ - --use_gpu=false \ - --num_passes=16 \ - --show_parameter_stats_period=100 \ - --trainer_count=4 \ - --log_period=10 \ - --dot_period=5 \ - 2>&1 | tee 'paraphrase/train.log' -paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1 diff --git a/demo/seqToseq/preprocess.py b/demo/seqToseq/preprocess.py deleted file mode 100755 index 03f371331a0755e5939e457f4bdfb1770b8dad88..0000000000000000000000000000000000000000 --- a/demo/seqToseq/preprocess.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Example: - python preprocess.py -i INPUT [-d DICTSIZE] [-m] - -Options: - -h, --help show this help message and exit - -i INPUT input original dataset path - -d DICTSIZE specified word count of dictionary - -m --mergeDict merge source and target dictionary -""" -import os -import sys - -import string -from optparse import OptionParser -from paddle.utils.preprocess_util import save_list, DatasetCreater - - -class SeqToSeqDatasetCreater(DatasetCreater): - """ - A class to process data for sequence to sequence application. - """ - - def __init__(self, data_path, output_path): - """ - data_path: the path to store the train data, test data and gen data - output_path: the path to store the processed dataset - """ - DatasetCreater.__init__(self, data_path) - self.gen_dir_name = 'gen' - self.gen_list_name = 'gen.list' - self.output_path = output_path - - def concat_file(self, file_path, file1, file2, output_path, output): - """ - Concat file1 and file2 to be one output file - The i-th line of output = i-th line of file1 + '\t' + i-th line of file2 - file_path: the path to store file1 and file2 - output_path: the path to store output file - """ - file1 = os.path.join(file_path, file1) - file2 = os.path.join(file_path, file2) - output = os.path.join(output_path, output) - if not os.path.exists(output): - os.system('paste ' + file1 + ' ' + file2 + ' > ' + output) - - def cat_file(self, dir_path, suffix, output_path, output): - """ - Cat all the files in dir_path with suffix to be one output file - dir_path: the base directory to store input file - suffix: suffix of file name - output_path: the path to store output file - """ - cmd = 'cat ' - file_list = os.listdir(dir_path) - file_list.sort() - for file in file_list: - if file.endswith(suffix): - cmd += os.path.join(dir_path, file) + ' ' - output = os.path.join(output_path, output) - if not os.path.exists(output): - os.system(cmd + '> ' + output) - - def build_dict(self, file_path, dict_path, dict_size=-1): - """ - 
Create the dictionary for the file, Note that - 1. Valid characters include all printable characters - 2. There is distinction between uppercase and lowercase letters - 3. There is 3 special token: - : the start of a sequence - : the end of a sequence - : a word not included in dictionary - file_path: the path to store file - dict_path: the path to store dictionary - dict_size: word count of dictionary - if is -1, dictionary will contains all the words in file - """ - if not os.path.exists(dict_path): - dictory = dict() - with open(file_path, "r") as fdata: - for line in fdata: - line = line.split('\t') - for line_split in line: - words = line_split.strip().split() - for word in words: - if word not in dictory: - dictory[word] = 1 - else: - dictory[word] += 1 - output = open(dict_path, "w+") - output.write('\n\n\n') - count = 3 - for key, value in sorted( - dictory.items(), key=lambda d: d[1], reverse=True): - output.write(key + "\n") - count += 1 - if count == dict_size: - break - self.dict_size = count - - def create_dataset(self, - dict_size=-1, - mergeDict=False, - suffixes=['.src', '.trg']): - """ - Create seqToseq dataset - """ - # dataset_list and dir_list has one-to-one relationship - train_dataset = os.path.join(self.data_path, self.train_dir_name) - test_dataset = os.path.join(self.data_path, self.test_dir_name) - gen_dataset = os.path.join(self.data_path, self.gen_dir_name) - dataset_list = [train_dataset, test_dataset, gen_dataset] - - train_dir = os.path.join(self.output_path, self.train_dir_name) - test_dir = os.path.join(self.output_path, self.test_dir_name) - gen_dir = os.path.join(self.output_path, self.gen_dir_name) - dir_list = [train_dir, test_dir, gen_dir] - - # create directory - for dir in dir_list: - if not os.path.exists(dir): - os.mkdir(dir) - - # checkout dataset should be parallel corpora - suffix_len = len(suffixes[0]) - for dataset in dataset_list: - file_list = os.listdir(dataset) - if len(file_list) % 2 == 1: - raise 
RuntimeError("dataset should be parallel corpora") - file_list.sort() - for i in range(0, len(file_list), 2): - if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]: - raise RuntimeError( - "source and target file name should be equal") - - # cat all the files with the same suffix in dataset - for suffix in suffixes: - for dataset in dataset_list: - outname = os.path.basename(dataset) + suffix - self.cat_file(dataset, suffix, dataset, outname) - - # concat parallel corpora and create file.list - print 'concat parallel corpora for dataset' - id = 0 - list = ['train.list', 'test.list', 'gen.list'] - for dataset in dataset_list: - outname = os.path.basename(dataset) - self.concat_file(dataset, outname + suffixes[0], - outname + suffixes[1], dir_list[id], outname) - save_list([os.path.join(dir_list[id], outname)], - os.path.join(self.output_path, list[id])) - id += 1 - - # build dictionary for train data - dict = ['src.dict', 'trg.dict'] - dict_path = [ - os.path.join(self.output_path, dict[0]), - os.path.join(self.output_path, dict[1]) - ] - if mergeDict: - outname = os.path.join(train_dir, train_dataset.split('/')[-1]) - print 'build src dictionary for train data' - self.build_dict(outname, dict_path[0], dict_size) - print 'build trg dictionary for train data' - os.system('cp ' + dict_path[0] + ' ' + dict_path[1]) - else: - outname = os.path.join(train_dataset, self.train_dir_name) - for id in range(0, 2): - suffix = suffixes[id] - print 'build ' + suffix[1:] + ' dictionary for train data' - self.build_dict(outname + suffix, dict_path[id], dict_size) - print 'dictionary size is', self.dict_size - - -def main(): - usage = "usage: \n" \ - "python %prog -i INPUT [-d DICTSIZE] [-m]" - parser = OptionParser(usage) - parser.add_option( - "-i", action="store", dest="input", help="input original dataset path") - parser.add_option( - "-d", - action="store", - dest="dictsize", - help="specified word count of dictionary") - parser.add_option( - "-m", - "--mergeDict", 
- action="store_true", - dest="mergeDict", - help="merge source and target dictionary") - (options, args) = parser.parse_args() - if options.input[-1] == os.path.sep: - options.input = options.input[:-1] - outname = os.path.basename(options.input) - output_path = os.path.join(os.path.dirname(options.input), 'pre-' + outname) - dictsize = int(options.dictsize) if options.dictsize else -1 - if not os.path.exists(output_path): - os.mkdir(output_path) - data_creator = SeqToSeqDatasetCreater(options.input, output_path) - data_creator.create_dataset(dictsize, options.mergeDict) - - -if __name__ == "__main__": - main() diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py deleted file mode 100644 index 3d1f86ec3b7eda4fceaf3a1e406e3d0a1a4a2f60..0000000000000000000000000000000000000000 --- a/demo/seqToseq/seqToseq_net.py +++ /dev/null @@ -1,204 +0,0 @@ -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import os -from paddle.trainer_config_helpers import * - - -def seq_to_seq_data(data_dir, - is_generating, - dict_size=30000, - train_list='train.list', - test_list='test.list', - gen_list='gen.list', - gen_result='gen_result'): - """ - Predefined seqToseq train data provider for application - is_generating: whether this config is used for generating - dict_size: word count of dictionary - train_list: a text file containing a list of training data - test_list: a text file containing a list of testing data - gen_list: a text file containing a list of generating data - gen_result: a text file containing generating result - """ - src_lang_dict = os.path.join(data_dir, 'src.dict') - trg_lang_dict = os.path.join(data_dir, 'trg.dict') - - if is_generating: - train_list = None - test_list = os.path.join(data_dir, gen_list) - else: - train_list = os.path.join(data_dir, train_list) - test_list = os.path.join(data_dir, test_list) - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={ - "src_dict_path": src_lang_dict, - "trg_dict_path": trg_lang_dict, - "is_generating": is_generating - }) - - return { - "src_dict_path": src_lang_dict, - "trg_dict_path": trg_lang_dict, - "gen_result": gen_result - } - - -def gru_encoder_decoder(data_conf, - is_generating, - word_vector_dim=512, - encoder_size=512, - decoder_size=512, - beam_size=3, - max_length=250, - error_clipping=50): - """ - A wrapper for an attention version of GRU Encoder-Decoder network - is_generating: whether this config is used for generating - encoder_size: dimension of hidden unit in GRU Encoder network - decoder_size: dimension of hidden unit in GRU Decoder network - word_vector_dim: dimension of word vector - beam_size: expand width in beam search - max_length: a stop condition of sequence generation - """ - for k, v in data_conf.iteritems(): - globals()[k] = v - source_dict_dim = len(open(src_dict_path, "r").readlines()) - target_dict_dim = 
len(open(trg_dict_path, "r").readlines()) - gen_trans_file = gen_result - - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) - src_embedding = embedding_layer( - input=src_word_id, - size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) - src_forward = simple_gru( - input=src_embedding, - size=encoder_size, - naive=True, - gru_layer_attr=ExtraLayerAttribute( - error_clipping_threshold=error_clipping)) - src_backward = simple_gru( - input=src_embedding, - size=encoder_size, - reverse=True, - naive=True, - gru_layer_attr=ExtraLayerAttribute( - error_clipping_threshold=error_clipping)) - encoded_vector = concat_layer(input=[src_forward, src_backward]) - - with mixed_layer(size=decoder_size) as encoded_proj: - encoded_proj += full_matrix_projection(input=encoded_vector) - - backward_first = first_seq(input=src_backward) - with mixed_layer( - size=decoder_size, - act=TanhActivation(), ) as decoder_boot: - decoder_boot += full_matrix_projection(input=backward_first) - - def gru_decoder_with_attention(enc_vec, enc_proj, current_word): - decoder_mem = memory( - name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) - - context = simple_attention( - encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem, ) - - with mixed_layer(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += full_matrix_projection(input=context) - decoder_inputs += full_matrix_projection(input=current_word) - - gru_step = gru_step_naive_layer( - name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size, - layer_attr=ExtraLayerAttribute( - error_clipping_threshold=error_clipping)) - - with mixed_layer( - size=target_dict_dim, bias_attr=True, - act=SoftmaxActivation()) as out: - out += full_matrix_projection(input=gru_step) - return out - - decoder_group_name = "decoder_group" - group_inputs = [ - StaticInput( - input=encoded_vector, is_seq=True), StaticInput( - 
input=encoded_proj, is_seq=True) - ] - - if not is_generating: - trg_embedding = embedding_layer( - input=data_layer( - name='target_language_word', size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - # For decoder equipped with attention mechanism, in training, - # target embeding (the groudtruth) is the data input, - # while encoded source sequence is accessed to as an unbounded memory. - # Here, the StaticInput defines a read-only memory - # for the recurrent_group. - decoder = recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = data_layer(name='target_language_next_word', size=target_dict_dim) - cost = classification_cost(input=decoder, label=lbl) - outputs(cost) - else: - # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. - - # The encoded source sequence (encoder's output) must be specified by - # StaticInput, which is a read-only memory. - # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. 
- - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) - group_inputs.append(trg_embedding) - - beam_gen = beam_search( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - seqtext_printer_evaluator( - input=beam_gen, - id_input=data_layer( - name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) diff --git a/demo/seqToseq/translation/eval_bleu.sh b/demo/seqToseq/translation/eval_bleu.sh deleted file mode 100755 index 54c2ed237e93adb3456dbe62f75626d36c2d90bc..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/eval_bleu.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e -gen_file=$1 -beam_size=$2 - -# find top1 generating result -top1=$(printf '%s_top1.txt' `basename $gen_file .txt`) -if [ $beam_size -eq 1 ]; then - awk -F "\t" '{sub(" ","",$2);sub(" ","",$2);print $2}' $gen_file >$top1 -else - awk 'BEGIN{ - FS="\t"; - OFS="\t"; - read_pos = 2} { - if (NR == read_pos){ - sub(" ","",$3); - sub(" ","",$3); - print $3; - read_pos += (2 + res_num); - }}' res_num=$beam_size $gen_file >$top1 -fi - -# evalute bleu value -bleu_script=multi-bleu.perl -standard_res=../data/wmt14/gen/ntst14.trg -bleu_res=`perl $bleu_script $standard_res <$top1` - -echo $bleu_res -rm $top1 diff --git a/demo/seqToseq/translation/gen.conf b/demo/seqToseq/translation/gen.conf deleted file mode 100644 index e9bea4e4559ff31ad83c4474e91de7e7acc77e9f..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/gen.conf +++ /dev/null @@ -1,36 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -sys.path.append("..") - -from seqToseq_net import * - -# whether this config is used for generating -is_generating = True - -### Data Definiation -gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating, - gen_result = "./translation/gen_result") - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 1, - learning_rate = 0) - -### Network Architecture -gru_encoder_decoder(gen_conf, is_generating) diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh deleted file mode 100755 index 64b78f5e9654e7b206740f92e224e0164108c9f1..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/gen.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -cd .. - -paddle train \ - --job=test \ - --config='translation/gen.conf' \ - --save_dir='data/wmt14_model' \ - --use_gpu=false \ - --num_passes=13 \ - --test_pass=12 \ - --trainer_count=1 \ - 2>&1 | tee 'translation/gen.log' -paddle usage -l 'translation/gen.log' -e $? 
-n "seqToseq_translation_gen" >/dev/null 2>&1 diff --git a/demo/seqToseq/translation/moses_bleu.sh b/demo/seqToseq/translation/moses_bleu.sh deleted file mode 100755 index 2f230d7f4c736da003966fbdb277f6b8b1ec952c..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/moses_bleu.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x -echo "Downloading multi-bleu.perl" -wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf deleted file mode 100644 index 72b7ccdbb95dbda8f06674079db9a3257bb31622..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/train.conf +++ /dev/null @@ -1,36 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -sys.path.append("..") - -from seqToseq_net import * - -# whether this config is used for generating -is_generating = False - -### Data Definiation -data_dir = "./data/pre-wmt14" -train_conf = seq_to_seq_data(data_dir = data_dir, - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating) diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh deleted file mode 100755 index b0ec9854b118cbb9ed39d6bed0cdd845403926a4..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/train.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -cd .. - -paddle train \ ---config='translation/train.conf' \ ---save_dir='translation/model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'translation/train.log' -paddle usage -l 'translation/train.log' -e $? 
-n "seqToseq_translation_train" >/dev/null 2>&1 diff --git a/demo/word2vec/api_train_v2.py b/demo/word2vec/api_train_v2.py deleted file mode 100644 index c0940f0e56eafa22f8aeb7052c0ddc79d8862917..0000000000000000000000000000000000000000 --- a/demo/word2vec/api_train_v2.py +++ /dev/null @@ -1,100 +0,0 @@ -import gzip -import math - -import paddle.v2 as paddle - -embsize = 32 -hiddensize = 256 -N = 5 - - -def wordemb(inlayer): - wordemb = paddle.layer.embedding( - input=inlayer, - size=embsize, - param_attr=paddle.attr.Param( - name="_proj", - initial_std=0.001, - learning_rate=1, - l2_rate=0, - sparse_update=True)) - return wordemb - - -def main(): - # for local training - cluster_train = False - - if not cluster_train: - paddle.init(use_gpu=False, trainer_count=1) - else: - paddle.init( - use_gpu=False, - trainer_count=2, - port=7164, - ports_num=1, - ports_num_for_sparse=1, - num_gradient_servers=1) - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - firstword = paddle.layer.data( - name="firstw", type=paddle.data_type.integer_value(dict_size)) - secondword = paddle.layer.data( - name="secondw", type=paddle.data_type.integer_value(dict_size)) - thirdword = paddle.layer.data( - name="thirdw", type=paddle.data_type.integer_value(dict_size)) - fourthword = paddle.layer.data( - name="fourthw", type=paddle.data_type.integer_value(dict_size)) - nextword = paddle.layer.data( - name="fifthw", type=paddle.data_type.integer_value(dict_size)) - - Efirst = wordemb(firstword) - Esecond = wordemb(secondword) - Ethird = wordemb(thirdword) - Efourth = wordemb(fourthword) - - contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) - hidden1 = paddle.layer.fc(input=contextemb, - size=hiddensize, - act=paddle.activation.Sigmoid(), - layer_attr=paddle.attr.Extra(drop_rate=0.5), - bias_attr=paddle.attr.Param(learning_rate=2), - param_attr=paddle.attr.Param( - initial_std=1. 
/ math.sqrt(embsize * 8), - learning_rate=1)) - predictword = paddle.layer.fc(input=hidden1, - size=dict_size, - bias_attr=paddle.attr.Param(learning_rate=2), - act=paddle.activation.Softmax()) - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - with gzip.open("batch-" + str(event.batch_id) + ".tar.gz", - 'w') as f: - trainer.save_parameter_to_tar(f) - result = trainer.test( - paddle.batch( - paddle.dataset.imikolov.test(word_dict, N), 32)) - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - - cost = paddle.layer.classification_cost(input=predictword, label=nextword) - - parameters = paddle.parameters.create(cost) - adagrad = paddle.optimizer.AdaGrad( - learning_rate=3e-3, - regularization=paddle.optimizer.L2Regularization(8e-4)) - trainer = paddle.trainer.SGD(cost, - parameters, - adagrad, - is_local=not cluster_train) - trainer.train( - paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), - num_passes=30, - event_handler=event_handler) - - -if __name__ == '__main__': - main() diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 6fa42fd0c71e78cc2fa6b0fe2cb970baf4ac89ed..94dd3457fb5b513441c4c8e339e1862de9092517 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -27,10 +27,6 @@ sphinx_add_target(paddle_docs ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) -add_dependencies(paddle_docs - gen_proto_py) - - # configured documentation tools and intermediate build results set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") @@ -51,6 +47,3 @@ sphinx_add_target(paddle_docs_cn ${SPHINX_CACHE_DIR_CN} ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) - -add_dependencies(paddle_docs_cn - gen_proto_py) diff --git a/doc/api/v2/config/evaluators.rst b/doc/api/v2/config/evaluators.rst index 39db51fa4abc370855ca3f2778b47464f33b6fce..9ac972fb193a2fb525edc507f7ba1303d2c8eabe 100644 --- 
a/doc/api/v2/config/evaluators.rst +++ b/doc/api/v2/config/evaluators.rst @@ -99,3 +99,12 @@ value_printer .. automodule:: paddle.v2.evaluator :members: value_printer :noindex: + +Detection +===== + +detection_map +------------- +.. automodule:: paddle.v2.evaluator + :members: detection_map + :noindex: diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 1efa74ecda4170332d96603ca2253c68468474f9..cb330ea5e1b914587a725c9b90a33053f3fbbc3d 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -59,6 +59,11 @@ context_projection .. autoclass:: paddle.v2.layer.context_projection :noindex: +row_conv +-------- +.. autoclass:: paddle.v2.layer.row_conv + :noindex: + Image Pooling Layer =================== @@ -99,6 +104,11 @@ cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm :noindex: + +row_l2_norm +----------- +.. autoclass:: paddle.v2.layer.row_l2_norm + :noindex: Recurrent Layers ================ @@ -130,7 +140,7 @@ recurrent_group --------------- .. autoclass:: paddle.v2.layer.recurrent_group :noindex: - + lstm_step --------- .. autoclass:: paddle.v2.layer.lstm_step @@ -145,12 +155,12 @@ beam_search ------------ .. autoclass:: paddle.v2.layer.beam_search :noindex: - + get_output ---------- .. autoclass:: paddle.v2.layer.get_output :noindex: - + Mixed Layer =========== @@ -193,6 +203,10 @@ identity_projection .. autoclass:: paddle.v2.layer.identity_projection :noindex: +slice_projection +------------------- +.. autoclass:: paddle.v2.layer.slice_projection + :noindex: table_projection ---------------- @@ -203,7 +217,7 @@ trans_full_matrix_projection ---------------------------- .. autoclass:: paddle.v2.layer.trans_full_matrix_projection :noindex: - + Aggregate Layers ================ @@ -243,6 +257,16 @@ seq_concat .. autoclass:: paddle.v2.layer.seq_concat :noindex: +kmax_sequence_score +------------------- +.. 
autoclass:: paddle.v2.layer.kmax_sequence_score + :noindex: + +sub_nested_seq +-------------- +.. autoclass:: paddle.v2.layer.sub_nested_seq + :noindex: + Reshaping Layers ================ @@ -311,6 +335,11 @@ scaling .. autoclass:: paddle.v2.layer.scaling :noindex: +clip +---- +.. autoclass:: paddle.v2.layer.clip + :noindex: + slope_intercept --------------- .. autoclass:: paddle.v2.layer.slope_intercept @@ -346,6 +375,12 @@ sampling_id .. autoclass:: paddle.v2.layer.sampling_id :noindex: +multiplex +--------- +.. autoclass:: paddle.v2.layer.multiplex + :noindex: + + Slicing and Joining Layers ========================== @@ -434,10 +469,44 @@ smooth_l1_cost .. autoclass:: paddle.v2.layer.smooth_l1_cost :noindex: -Check Layer +multibox_loss +-------------- +.. autoclass:: paddle.v2.layer.multibox_loss + :noindex: + +Check Layer ============ eos --- .. autoclass:: paddle.v2.layer.eos :noindex: + +Miscs +===== + +dropout +-------------- +.. autoclass:: paddle.v2.layer.dropout + :noindex: + +Activation with learnable parameter +=================================== + +prelu +-------- +.. autoclass:: paddle.v2.layer.prelu + :noindex: + +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + +Detection output Layer +====================== + +detection_output +---------------- +.. autoclass:: paddle.v2.layer.detection_output + :noindex: diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst index b2a617fff134035c04eeabbbaf6d9cbe2a525f1c..6e813ab1a820d068ea3e54cad6178f1cf928eadc 100644 --- a/doc/api/v2/config/networks.rst +++ b/doc/api/v2/config/networks.rst @@ -125,11 +125,3 @@ simple_attention :members: simple_attention :noindex: -Miscs -===== - -dropout_layer --------------- -.. 
automodule:: paddle.v2.networks - :members: dropout_layer - :noindex: diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md new file mode 100644 index 0000000000000000000000000000000000000000..1f4d4ec16f7c395005e610751d95c10f5f3adf52 --- /dev/null +++ b/doc/design/auto_gradient_check.md @@ -0,0 +1,146 @@ +## Auto Gradient Checker Design + +## Backgraound: +- Operator forward computing is easy to check if the result is right because it has a clear definition. **But** backpropagation is a notoriously difficult algorithm to debug and get right: + - 1. you should get the right backpropagation formula according to the forward computation. + - 2. you should implement it right in CPP. + - 3. it's difficult to prepare test data. + +- Auto gradient check gets a numeric gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages: + - 1. numeric gradient checker only need forward operator. + - 2. user only need to prepare the input data for forward Operator. + +## Mathematical Theory +The following two document from stanford has a detailed explanation of how to get numeric gradient and why it's useful. + +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) + + +## Numeric Gradient Implementation +### Python Interface +```python +def get_numeric_gradient(op, + input_values, + output_name, + input_to_check, + delta=0.005, + local_scope=None): + """ + Get Numeric Gradient for an operator's input. + + :param op: C++ operator instance, could be an network + :param input_values: The input variables. Should be an dictionary, key is + variable name. Value is numpy array. + :param output_name: The final output variable name. 
+ :param input_to_check: The input variable need to get gradient. + :param delta: The perturbation value for numeric gradient method. The + smaller delta is, the more accurate result will get. But if that delta is + too small, it could occur numerical stability problem. + :param local_scope: The local scope used for get_numeric_gradient. + :return: The gradient array in numpy format. + """ +``` + +### Explaination: + +- Why need `output_name` + - One Operator may have multiple Output, you can get independent gradient from each Output. So user should set one output to calculate. + +- Why need `input_to_check` + - One operator may have multiple inputs. Gradient Op can calculate the gradient of these Inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times. + + +### Core Algorithm Implementation + + +```python + # we only compute gradient of one element each time. + # we use a for loop to compute the gradient of every element. + for i in xrange(tensor_size): + # get one input element throw it's index i. + origin = tensor_to_check.get_float_element(i) + + # add delta to it, run op and then get the sum of the result tensor. + x_pos = origin + delta + tensor_to_check.set_float_element(i, x_pos) + y_pos = get_output() + + # plus delta to this element, run op and get the sum of the result tensor. + x_neg = origin - delta + tensor_to_check.set_float_element(i, x_neg) + y_neg = get_output() + + # restore old value + tensor_to_check.set_float_element(i, origin) + + # compute the gradient of this element and store it into a numpy array. + gradient_flat[i] = (y_pos - y_neg) / delta / 2 + + # reshape the gradient result to the shape of the source tensor. 
+ return gradient_flat.reshape(tensor_to_check.get_dims()) +``` + +## Auto Graident Checker Framework + +Each Operator Kernel has three kinds of Gradient: + +- 1. Numeric Gradient +- 2. CPU Operator Gradient +- 3. GPU Operator Gradient(if supported) + +Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as the reference value. + +- 1. calculate the numeric gradient. +- 2. calculate CPU kernel Gradient with the backward Operator and compare it with the numeric gradient. +- 3. calculate GPU kernel Gradient with the backward Operator and compare it with the numeric gradient.(if support GPU) + +#### Python Interface + +```python + def check_grad(self, + forward_op, + input_vars, + inputs_to_check, + output_name, + no_grad_set=None, + only_cpu=False, + max_relative_error=0.005): + """ + :param forward_op: used to create backward_op + :param input_vars: numpy value of input variable. The following + computation will use these variables. + :param inputs_to_check: inputs var names that should check gradient. + :param output_name: output name that used to + :param max_relative_error: The relative tolerance parameter. + :param no_grad_set: used when create backward ops + :param only_cpu: only compute and check gradient on cpu kernel. + :return: + """ +``` + +### How to check if two numpy array is close enough? +if `abs_numeric_grad` is nearly zero, then use abs error for numeric_grad, not relative + +```python +numeric_grad = ... +operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) + +abs_numeric_grad = numpy.abs(numeric_grad) +# if abs_numeric_grad is nearly zero, then use abs error for numeric_grad, not relative +# error. +abs_numeric_grad[abs_numeric_grad < 1e-3] = 1 + +diff_mat = numpy.abs(abs_numeric_grad - operator_grad) / abs_numeric_grad +max_diff = numpy.max(diff_mat) +``` + + +#### Notes: +1,The Input data for auto gradient checker should be reasonable to avoid numeric problem. 
+ + +#### Refs: + +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md index 310739f37ae48934afe1d042e87efef85b98f1fc..bf0e4dddc1b640ecbce489f65820aaf8a4b3b1e7 100644 --- a/doc/design/build_system/README.md +++ b/doc/design/build_system/README.md @@ -105,3 +105,48 @@ shared_library(api ### Implementation As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`. + +### Using Package Manager For Go + +Building Go binaries and libraries need to satisfy their dependencies, generally +we can do `go get ./...` to download and compile all external dependencies. The +problems are: + +1. `go get` will always get the latest code from the default branch of the + remote repo, so changes of dependents might break the build. This is very + different with what we already have in `cmake/external` which download a + specific version or commit id of the dependency. +1. Some locations can not access external dependencies through the internet, as mentioned + in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management + tools can package the dependencies as a "vendor" package, which can be mirrored + at many cloud file hosting, so users what to compile paddle by themselves can + download this "vendor" package from a mirror site. + +#### Choose A Suitable Tool + +As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools) +list dozens of Go package managers. 
We choose the tool using following principles: + +- Most "active" projects with more stars, more pull requests or commits +- Widely used project + +After comparing all these projects, we shall choose between the most popular +tools: Godep and Glide. + +Here's a brief comparison between Godep and Glide +: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are +also many complaints about using `Godep`. There's also a new "official" pakcage +management tool has been started at: https://github.com/golang/dep to resolve +such problems, but it's currently at Alpha stage. So the best choice now is +glide obviously. + +#### Manage Go Packages + +- Dependencies: `go/glide.yaml` will store the dependencies and their versions which + is directly imported by paddle. `go/glide.lock` will store all dependencies recursively + with their commit id. Builds will "lock" to these packages if we don't `glide up` + them +- Vendor package: `go/vendor` directory will generated when running `cmake` command. `cmake` + will download the code corresponding to `go/glide.lock`. If we put a vendor folder + under `go/`, cmake will just check the commit id to the packages under the folder, + if commit id matches, there will be no download at all. diff --git a/doc/design/cluster_train/master_server.md b/doc/design/cluster_train/master_server.md index bb8307652587b4dc56cd668a3a5e64722734d194..4bf3c506f101361875043f8bfd97972b8c981a22 100644 --- a/doc/design/cluster_train/master_server.md +++ b/doc/design/cluster_train/master_server.md @@ -10,7 +10,7 @@ A dataset is a list of files in *RecordIO* format. A RecordIO file consists of c ## Task Queue -As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *blocks* from one or multiple files. The master server maintains *task queues* to track the training progress. 
+As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *chunks* from one or multiple files. The master server maintains *task queues* to track the training progress. ### Task Queue Creation @@ -21,23 +21,23 @@ As mentioned in [distributed training design doc](./README.md), a *task* is a da func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error { } ``` -1. The master server will scan through each RecordIO file to generate the *block index* and know how many blocks does each file have. A block can be referenced by the file path and the index of the block within the file. The block index is in memory data structure that enables fast access to each block, and the index of the block with the file is an integer start from 0, representing the n-th block within the file. +1. The master server will scan through each RecordIO file to generate the *chunk index* and know how many chunks does each file have. A chunk can be referenced by the file path and the index of the chunk within the file. The chunk index is in memory data structure that enables fast access to each chunk, and the index of the chunk with the file is an integer start from 0, representing the n-th chunk within the file. - The definition of the block is: + The definition of the chunk is: ```go - type Block struct { - Idx int // index of the block within the file + type Chunk struct { + Idx int // index of the chunk within the file Path string - Index recordio.Index // block index + Index recordio.Index // chunk index } ``` -1. Blocks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element. +1. Chunks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element. 
The definition of the task is: ```go type Task struct { Index int - Blocks []Block + Chunks []Chunk } ``` diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md index 392bab25e9de6bf5aa7cc1b0ad345ef12f1d9e5d..474b8c572cd92fc87e9f7f3f2b19d12cccd158de 100644 --- a/doc/design/cluster_train/pserver_client.md +++ b/doc/design/cluster_train/pserver_client.md @@ -55,7 +55,7 @@ The trainer select process is encapsulated in the C API function: ```c int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto); ``` -The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' call to `paddle_begin_init_params` will block until initialization is done, and return 0. As illustrated below: +The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' call to `paddle_begin_init_params` will return 0. `paddle_get_params` will be blocked until initialization is completed. As illustrated below: @@ -74,14 +74,25 @@ typedef enum { typedef struct { char* name; paddle_element_type element_type; - void* content; + unsigned char* content; int content_len; } paddle_parameter, paddle_gradient; -typedef struct paddle_pserver_client paddle_pserver_client; +typedef int paddle_pserver_client; -paddle_pserver_client* paddle_new_pserver_client(); -void paddle_pserver_client_release(paddle_pserver_client* client); +/** + * @brief creates a pserver client that talks to etcd for coordination. + */ +paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr); + +/** + * @brief creates a pserver client given pserver addresses. + * + * @param pserver_addrs comma-separated pserver addresses. + * @param selected if current pserver client is selected to initialize all parameter servers. 
+ */ +paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected); +void paddle_pserver_client_release(paddle_pserver_client c); /** * @brief paddle_begin_init_params begins to initialize parameters on @@ -89,16 +100,13 @@ void paddle_pserver_client_release(paddle_pserver_client* client); * * paddle_begin_init_params will be called from multiple trainers, * only one trainer will be selected to initialize the parameters on - * parameter servers. Other trainers will be blocked until the - * initialization is done, and they need to get the initialized + * parameter servers. Other trainers need to get the initialized * parameters from parameter servers using @paddle_get_params. * - * @param pserver_config_proto serialized parameter server configuration in - * Protocol Buffers format. * @return 1 if the trainer is selected to initialize parameter * servers, otherwise 0. */ -int paddle_begin_init_params(paddle_pserver_client* client, const char* pserver_config_proto); +int paddle_begin_init_params(paddle_pserver_client client); /** * @brief paddle_init_param initializes the parameter on parameter @@ -106,12 +114,13 @@ int paddle_begin_init_params(paddle_pserver_client* client, const char* pserver_ * * @param param the parameter to initialize. * @param param_config_proto the configuration for the parameter. + * @param config_len the length of param_config_proto * @return 0 if successful, otherwise -1. On failure, the trainer * needs to restart the entire initialization process (starting from * @paddle_begin_init_param). Or simply exit the program and wait for * the cluster management system to restart the trainer. 
*/ -int paddle_init_param(paddle_pserver_client* client, paddle_parameter params, const char* param_config_proto); +int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); /** * @brief paddle_finish_init_params tells parameter servers client has @@ -122,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter params, co * @paddle_begin_init_param). Or simply exit the program and wait for * the cluster management system to restart the trainer. */ -int paddle_finish_init_params(paddle_pserver_client* client); +int paddle_finish_init_params(paddle_pserver_client client); /** * @brief paddle_send_grads sends gradients to parameter servers for @@ -133,18 +142,23 @@ int paddle_finish_init_params(paddle_pserver_client* client); * @param learning_rate the learning rate for the gradients. * @return 0 if successful, otherwise -1. */ -int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len); +int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len); /** * @brief paddle_get_params gets parameters from parameter servers. * - * @param names the array of names of the parameters to get. - * @param dst the destination array of parameters to save to. + * paddle_get_params will block until parameters are initialized on + * the parameter servers. + * + * @param dst the destination array of parameter pointers to save to. + * The parameter pointer must be pre-popullated with required parameter name, + * and the content of parameter must be pre-allocated of the size of required + * parameter on pserver. * @param len the length of the names array and the paddle_parameter * array. * @return 0 if successful, otherwise -1. 
+ */ -int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len); +int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len); /** * @brief paddle_save_model indicates parameters to save the parameter @@ -153,5 +167,5 @@ int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_ * @param path the path to save parameters. * @return 0 if successful, otherwise -1. */ -int paddle_save_model(paddle_pserver_client* client, const char* path); +int paddle_save_model(paddle_pserver_client client, const char* path); ``` diff --git a/doc/design/cluster_train/remote_parameter_updater.md b/doc/design/cluster_train/remote_parameter_updater.md new file mode 100644 index 0000000000000000000000000000000000000000..6e8e5938455b869e0f3367794c41250340b37f77 --- /dev/null +++ b/doc/design/cluster_train/remote_parameter_updater.md @@ -0,0 +1,21 @@ +# Design Doc: Remote Parameter Updater for Cluster Train + +For an overview of distributed training, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that will use parameter server client [The Client Library of Parameter Server Design Doc](pserver_client.md) to manage and update parameters. + +## Parameter Updater + +Parameter Updater is used by trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster train, we will only discuss remote parameter updater here. + +### Remote Parameter Updater + +Remote Parameter Updater manages parameters through remote parameter server with the client that communicates with pserver([The Client Library of Parameter Server Design Doc](pserver_client.md)) + +In PaddlePaddle Python V2 API, trainer is implemented in python, and the trainer will hold an instance of parameter updater and call its functions directly. 
In this design, we will also expose the api of RemoteParameterUpdater to python with swig. + +#### Sparse Remote Parameter Updater + +Since we will only implement dense parameter management now, the mechanism for sparse parameter will be discussed in the next stage. + +### Interface Design + +TBD diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md new file mode 100644 index 0000000000000000000000000000000000000000..b755185c81ad617b9c85c47de0f5f65d2201c658 --- /dev/null +++ b/doc/design/cluster_train/save_model.md @@ -0,0 +1,111 @@ +# Design Doc: Save Model + +## Overview + +The model is the output of the training process. There are two +ways from which user can obtain a model: + +- Save model triggered by user code: user code asks PaddlePaddle to + save a model. +- Convert model from the checkpoint: model being converted from + pservers' periodic checkpoint. In this way, the user can cancel a + job at any time, and still have a relatively fresh model (we + checkpoint around every 5 minutes). + +### Trainer Saving Model vs. Pservers Saving Model + +Both trainers and pservers have access to the model. So the model can +be saved from a trainer or pservers. We need to decide where the model +is saved from. + +#### Dense Update vs. Sparse Update + +There are two types of model update methods: dense update and sparse +update (when the model parameter is configured to be sparse). + +- Dense update + + Every trainer has its own full copy of the model. Every model + update will update the entire model. + +- Sparse update + + The training input is sparse, and the trainer does not have the + entire model. It will only download the sub-model necessary related + to the input. When updating the model, only the sub-model related to + the training input is updated. + + +#### Pservers Saving Model + +The benefit of letting pservers save model is they have the entire +model all the time. 
However, since pservers are on different nodes, it +requires a merging process to merge model shards into the same +model. Thus requires the pservers to write models to a distributed +filesystem, making the checkpoint shards visible to the merge program. + +#### Trainer Saving Model + +The benefit of letting one trainer to save the model is it does not +require a distributed filesystem. And it's reusing the same save model +logic when training locally - except when doing sparse update, the +trainer needs to download the entire model during the saving process. + +#### Conclusion + +Given trainer saving model does not require a distributed filesystem, +and is an intuitive extension to trainer saving model when training +locally, we decide to let the trainer save the model when doing +distributed training. + + +### Convert Model from Checkpoint + +TODO + + +## Timeline + +We first implement trainer save the model. Converting the latest +snapshot to a model will be a TODO for future. + + +## Trainer Save Model + +### Trainer Election + +One trainer will be elected as the one to save the model. When using +etcd, trainer ID is a randomly generated UUID, the trainer will +contact the master server requesting to save the model, and find out +if itself is elected. When the master server is not used, unique +trainer IDs will be given by the administrator, the trainer whose ID +is "0" is elected to save the model. + +### Model Save Path + +Each trainer will be given the directory to save the model. The +elected trainer will save the model to +`given-directory/trainerID`. Since the trainer ID is unique, this +would prevent concurrent save to the same file when multiple trainers +are elected to save the model when split-brain problem happens. + +### What Happens When Model Is Saving + +It takes some time to save model, we need to define what will happen +when save model is taking place. + +When doing dense update, the trainer uses the local model. 
Pservers +does not need to pause model update. + +When doing sparse update. The trainer needs to download the entire +model while saving. To get the most accurate model, the model update +needs to be paused before the download starts and resumed after the +download finishes. Otherwise, the trainer gets a model that is +"polluted": some part of the model is old, some part of the model is +new. + +It's unclear that the "polluted" model will be inferior due to the +stochastic nature of deep learning, and pausing the model update will +add more complexity to the system. Since supporting sparse update is a +TODO item. We defer the evaluation of pause the model update or not +during saving model to the future. diff --git a/doc/design/cluster_train/src/pserver_init.graffle b/doc/design/cluster_train/src/pserver_init.graffle index 730d3a561ffdc19e723b3cf6612471440951826a..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676 100644 Binary files a/doc/design/cluster_train/src/pserver_init.graffle and b/doc/design/cluster_train/src/pserver_init.graffle differ diff --git a/doc/design/cluster_train/src/pserver_init.png b/doc/design/cluster_train/src/pserver_init.png index 4d502226d82ba271c50ae1bec5efaaaac4cc4434..dfe491ff98dd7db1c336093c80964a260df2cd90 100644 Binary files a/doc/design/cluster_train/src/pserver_init.png and b/doc/design/cluster_train/src/pserver_init.png differ diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..e956994431fbb43438c56dcd96ad8313cf516090 --- /dev/null +++ b/doc/design/mkldnn/README.MD @@ -0,0 +1,110 @@ +# Intel® MKL-DNN on PaddlePaddle: Design Doc + +我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 + +我们短期内的基本目标是: + +- 完成常用layer的MKL-DNN实现。 +- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 + + +## Contents + +- [Overview](#overview) +- [Actions](#actions) + - [CMake](#cmake) + - [Layers](#layers) + - 
[Activations](#activations) + - [Unit Tests](#unit-tests) + - [Protobuf Messages](#protobuf-messages) + - [Python API](#python-api) + - [Demos](#demos) + - [Benchmarking](#benchmarking) + - [Others](#others) +- [Design Concerns](#design-concerns) + +## Overview + +我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图 +
+
+Figure 1. PaddlePaddle on IA. +
+ +## Actions +我们把集成方案大致分为了如下几个方面。 + +### CMake +我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。 + +同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。 + +所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 + +**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。 + +### Layers +所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 +`paddle/gserver/layers`中,并且文件名都会一以*Mkldnn*开头。 + +所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。 + +### Activations +由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口,实现方法还是会在`ActivationFunction.cpp`文件。 + +### Unit Tests +会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。 + +Activation的测试,计划在PaddlePaddle原有的测试文件上直接添加新的测试type。 + +### Protobuf Messages +根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。 + +### Python API +目前只考虑**v1 API**。 + +计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。 + +具体实现方式比如: + +```python +use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) +if use_mkldnn + self.layer_type = mkldnn_* +``` + +所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。 + +并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。 + +### Demos + +会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 + +### Benchmarking +会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`,添加使用MKL-DNN的测试。 + +### Others +1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。 +2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 + +## Design Concerns + +为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。 + +我们总结出一些特别需要注意的点: + +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MkldnnLayer`特有的设备ID。 +2. 
重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +3. 创建`MkldnnMatrix`,用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 +4. 创建`MkldnnBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`,和未来可能还会用到`FPGAEngine`等。 +5. 在**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue`和`mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。 +6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。 +7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 + +## References + +1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN") +2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 + diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..84b455c28230703599a2529f014cfbb222138fef Binary files /dev/null and b/doc/design/mkldnn/image/overview.png differ diff --git a/doc/design/parameters_in_cpp.md b/doc/design/parameters_in_cpp.md new file mode 100644 index 0000000000000000000000000000000000000000..b6f99bc7d9d6fafacb0a4bcff806b65d9aef98cc --- /dev/null +++ b/doc/design/parameters_in_cpp.md @@ -0,0 +1,41 @@ +# Design Doc: The C++ Class `Parameters` + +`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and make Paddle can shared parameter between topologies. We described usages of `Parameter` in [api.md](./api.md). + +We used Python to implement Parameters when designing V2 API before. 
There are several defects for current implementation: +* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. +* We did not implement share Parameters while training. We just trigger `memcpy` when start training. + +It is necessary that we implement Parameters in CPP side. However, it could be a code refactoring for Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current Paddle implementation, there are three concepts associated with `Parameters`: + +1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`. +It is evident that we should use `paddle::Parameter` when developing `Parameters`. +However, the `Parameter` class contains many functions and does not have a clear interface. +It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`. +When we developing `Parameters`, we only use `create/store Parameter` functionality. +We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation. + +2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`. +We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies. +Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs. +`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device. + +3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. +So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD). + + +The step by step approach for implementation Parameters in Paddle C++ core is listed below. 
Each step should be a PR and could be merged into Paddle one by one. + +1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters. + +2. Implement a `Parameters` class. It just stores the `paddle::Parameter` inside. Make `GradientMachine` use `Parameters` as a class member. + +3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies. +Because we need to share `Parameters` between topologies, it is `Parameters`'s responsibility to exchange Parameters between GPUs. +`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` is only used to train one topology and we need to support training many topologies in Paddle, i.e., there could be many GradientMachines using one `Parameters`. + * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invokes this function, which takes `Parameters` as its input. + * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler. + +4. Make `Parameters` an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies. + +5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. At the end of this code refactoring, we could change `ParameterUpdater` to use `Parameters` directly to make `ParameterUpdater`'s implementation clear. 
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 3692a5248a355cfcfd1cfd0911d43d65166921b1..0c10e782808ca6456347ec54cb5e921162731ede 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -11,6 +11,15 @@ Paddle每次发新的版本,遵循以下流程: * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步 + * 编译这个版本的python wheel包,并发布到pypi。 + * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 + * pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。 + * 上传方法: + ``` + cd build/python + pip install twine + twine upload dist/[package to upload] + ``` 4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 6. 协同完成Release Note的书写 diff --git a/doc/design/scope.md b/doc/design/scope.md new file mode 100644 index 0000000000000000000000000000000000000000..c9e0be716b606f6c7bf0373e0c6e632647e07a6f --- /dev/null +++ b/doc/design/scope.md @@ -0,0 +1,124 @@ +# Design of Scope in Paddle + +## Overview + +Scope is an important concept in programming languages, which defines a program region that a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. And in another scope, this name may refer to other entity or nothing at all. It clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. 
But different from the original abstract concept, Scope now becomes an object with two important attributes: + +- Scope is an association of a name to variable. +- Variables in a parent scope can be retrieved from local scope. + +A detailed explanation of these two attributes goes as following. + + +## Scope is an association of a name to variable. + +Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variable in the scope. + + +1. Scope only contains a map of a name to variable. + + All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state(momentum) etc. + +1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear. + +1. Scope only contains methods that are used to Create and Get Variables. Scope do not contain Operators and have no information to run them. + `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables. + - `Create` is used to create a Variable by its name and add the mapping relation. + - `Get` is used to find a Variable by name. + +1. Every variable only belongs to one certain Scope. + + Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`. + +1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. + + Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. 
If user store `Variable` pointer to private data member or some global variable, the pointer will be a invalid pointer when associated `Scope` is destroyed. + +```cpp +class Scope { + public: + Variable* NewVar(const std::string& name); + const Variable* FindVar(const std::string& name) const; + + private: + std::unordered_map> vars_; +}; +``` + + +## Parent scope and local scope + +Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope. + +1. We can create local variables in a local scope. When that local scope are destroyed, all local variables should also be destroyed. +2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent. + +```cpp +class Scope { + public: + Scope(const std::shared_ptr& scope): parent_(scope) {} + + Variable* FindVar(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } else if (parent_ != nullptr) { + return parent_->FindVar(name); + } else { + return nullptr; + } + } + + private: + std::shared_ptr parent_ {nullptr}; +}; +``` + +In `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When user `Get` a variable by its `name`, the `name` will be searched inside the current scope. If the variable cannot be found locally and parent scope is not a `nullptr`, the variable will be searched inside that parent scope. `parent_` pointer's default value is `nullptr`. It means that the scope is a global scope when `parent_` is nullptr. + +A local scope is very useful when we implement Recurrent Neural Network. 
Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily. + +# Interface Design + +```cpp +class Variable { + private: + Variable() = default; + friend class Scope; +}; + +class Scope { + private: + Scope(const std::shared_ptr& parent = nullptr); + + public: + static std::shared_ptr Create(const std::shared_ptr& parent = nullptr); + + // return nullptr if not found. + Variable* FindVar(const std::string& name) const; + + // return if already contains same name variable. + Variable* NewVar(const std::string& name); + + private: + std::shared_ptr parent_; + std::unordered_map> vars_; +}; +``` +## Only scope can create a variable + +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`. + +## When scope destroyed, all variables inside this scope should be destroyed together + +The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together. + +## Sharing a parent scope + +Local scope contains a `parent_` pointer. It is a linked-list for scopes. Using a `shared_ptr` because when a local scope is using, its parents cannot be destroyed. + +Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer. + +## Orthogonal interface + +`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return a `Error` when there is a name conflict locally. 
Combine `FindVar` and `NewVar`, we can implement `NewVar` easily. diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md new file mode 100644 index 0000000000000000000000000000000000000000..5e07c29c56d21728599195d420d3222213d77e7c --- /dev/null +++ b/doc/design/simple_op_design.md @@ -0,0 +1,202 @@ +## Interaction between C++ and Python + +Users employ API in Python to describe their own network, however, the network construction actually happens in C++, so Protobuf is introduced to send the message between Python and C++. + +The Interaction between Python and C++ can be simplified as two steps: + +1. C++ tells Python how many Ops there are, and what parameters users need to offer to initialize a new Op. Python then builds API for each Op at compile time. + +2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ to finish Op construction task. + +### Message from C++ to Python + +We define a Protobuf message class `OpProto` to hold message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can be used to describe a whole Op.” + +The following messages are necessary: + +1. Op's name, and its simple comment. +2. Input and output variable number; each variable's name, type, and comment. +3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**. 
+ +So `OpProto` can be defined as follows: + +```proto +enum AttrType { + INT = 1; + FLOAT = 2; + STRING = 3; + INTS = 4; + FLOATS = 5; + STRINGS = 6; +}; + +message AttrValue { + AttrType type = 1; + optional int iv = 2; + optional float fv = 3; + optional string sv = 4; + repeated int ivs = 5; + repeated float fvs = 6; + repeated string svs = 7; +}; + +message AttrProto { + required string name = 1; + required string comment = 2; + required AttrType type = 3; +}; + +message VarProto { + required string name = 1; + required string comment = 2; + required bool is_tensor = 3; +}; + +message OpProto { + repeated VarProto inputs = 1; + repeated VarProto outputs = 2; + repeated AttrProto attrs = 3; + required string type = 4; + required string comment = 5; +}; +``` + +To generate Python code automatically: + +```python +def create_python_ops_creatation_functions(): + op_protos = paddle.framework.OpRegistry.get_all_op_proto() + for type_name in op_protos: + op_proto = op_protos[type_name] + def __impl__(**kwargs): # User must use key word args in Paddle API + inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs] + outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs] + attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs] + opdesc = (input, outputs, type_name, attrs) + return paddle.framework.OpRegistry.CreateOp(opdesc) + __impl__.__doc__ = create_doc_string(op_proto) + globals()[type_name] = __impl__ + +create_python_ops_creatation_functions() +``` + +### Message from Python to C++ + +To hold message needed in the above second step, we define Protobuf message class `OpDesc`. It is used to hold user-specified parameters in Op describing. + +```proto +message OpDesc { + required string type = 1; + repeated string inputs = 2; + repeated string outputs = 3; + map attrs = 4; +}; +``` + +## OpProto Register + +Every Op has its own `OpProto`. For using convenience, we need to register them and record all their messages. 
For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function `OpRegistry::RegisterOp()`. + +```cpp +class OpProtoMaker { +public: + OpProtoMaker(OpProto* proto): proto_(proto) {} +protected: + OpProto* proto_; + void AddInput(const std::string& name, const std::string& desc) {...} + void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...} + void AddComment(const std::string& comment) { ... } +}; + +class OpRegistry { +public: + using OpCreator = std::function; + + template + static void RegisterOp(const std::string& name) { + gCreators_[name] = [](const OpDesc& desc) { + return new OpType(desc); + }; + OpProto& opProto = gProtos_[name]; + OpMaker()(&opProto); + } + + static map gCreators_; + static map gProtos_; +}; + +template +class OpRegister { + public: + OpRegister(std::string type) { + OpRegistry::RegisterOp(type); + } +}; + +#define REGISTER_OP(op_class, op_maker_class, type_name) \ + class op_class##Register { \ + private: \ + const static OpRegister<#op_class, #op_maker_class> reg; \ + }; \ + const Register op_class##Register::reg(#type_name); + +class CosineOp { +// ... +} + +struct CosineOpProtoMaker : public OpProtoMaker { + CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { + AddInput("input", "input of cosine op"); + AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0); + AddType("cos"); + AddComment("This is cos op"); + } +} + +REGISTER_OP(CosineOp, CosineOpProtoMaker, cos); +``` + +In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here. + +## Python API + +Python APIs are divided into two types, high-level API and low-level API. 
+ +### High-Level API + +High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs. + +Here is a sample about how to define a fc layer: + +```python +hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid"); +``` + +`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input. + +The definition of `fc_layer()`: + +```python +def fc_layer(input, size, with_bias, activation): + attr_map = {"size":size} + check_attrs(attr_map) + w = make_variable('w') + if with_bias: + b = make_variable('b') + else: + b = None + fc_output = make_variable('fc_output'); + fc_op(input, w, b, fc_output, attr_map) + act_output = make_variable('sigmod_output'); + if activation == "sigmod": + sigmod_op(fc_output, act_output); + elif: + # ... + return act_output; +``` + +### Low-Level API + +In the above sample, `fc_op` and `sigmod_op` are low-level APIs. They build `OpDesc` and invoke corresponding C++ code. + +*TODO* diff --git a/doc/design/speech/README.MD b/doc/design/speech/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..7304650e628dba210488cd2dc4836318b5383b2a --- /dev/null +++ b/doc/design/speech/README.MD @@ -0,0 +1,155 @@ +# DeepSpeech2 on PaddlePaddle: Design Doc + +We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals: + +- Release a basic distributed implementation of DS2 on PaddlePaddle. +- Contribute a chapter of Deep Speech to PaddlePaddle Book. + +Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan. 
+ +## Table of Contents + +- [Tasks](#tasks) +- [Task Dependency](#task-dependency) +- [Design Details](#design-details) + - [Overview](#overview) + - [Row Convolution](#row-convolution) + - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm) +- [Future Work](#future-work) +- [References](#references) + +## Tasks + +We roughly break down the project into 14 tasks: + +1. Develop an **audio data provider**: + - Json filelist generator. + - Audio file format transformer. + - Spectrogram feature extraction, power normalization etc. + - Batch data reader with SortaGrad. + - Data augmentation (optional). + - Prepare (one or more) public English data sets & baseline. +2. Create a **simplified DS2 model configuration**: + - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*). + - With only bidirectional-GRU (otherwise need *Task 4*). + - With only greedy decoder (otherwise need *Task 5, 6*). +3. Develop to support **variable-shaped** dense-vector (image) batches of input data. + - Update `DenseScanner` in `dataprovider_converter.py`, etc. +4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details): + - Lookahead convolution windows. + - Within-row convolution, without kernels shared across rows. +5. Build KenLM **language model** (5-gram) for beam search decoder: + - Use KenLM toolkit. + - Prepare the corpus & train the model. + - Create inference interfaces (for Task 6). +6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT: + - Beam search with CTC. + - Beam search with external custom scorer (e.g. LM). + - Try to design a more general beam search interface. +7. Develop a **Word Error Rate evaluator**: + - update `ctc_error_evaluator`(CER) to support WER. +8. Prepare internal dataset for Mandarin (optional): + - Dataset, baseline, evaluation details. + - Particular data preprocessing for Mandarin. + - Might need cooperating with the Speech Department. +9. 
Create **standard DS2 model configuration**: + - With variable-length audio sequences (need *Task 3*). + - With unidirectional-GRU + row-convolution (need *Task 4*). + - With CTC-LM beam search decoder (need *Task 5, 6*). +10. Make it run perfectly on **clusters**. +11. Experiments and **benchmarking** (for accuracy, not efficiency): + - With public English dataset. + - With internal (Baidu) Mandarin dataset (optional). +12. Time **profiling** and optimization. +13. Prepare **docs**. +14. Prepare PaddlePaddle **Book** chapter with a simplified version. + +## Task Dependency + +Tasks parallelizable within phases: + +Roadmap | Description | Parallelizable Tasks +----------- | :------------------------------------ | :-------------------- +Phase I | Simplified model & components | *Task 1* ~ *Task 8* +Phase II | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12* +Phase III | Documentations | *Task 13* ~ *Task 14* + +An issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed! + +## Design Details + +### Overview + +Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, acoustic model, pronunciation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost. + +Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge. 
+ +The classical DS2 network contains 15 layers (from bottom to top): + +- **Two** data layers (audio spectrogram, transcription text) +- **Three** 2D convolution layers +- **Seven** uni-directional simple-RNN layers +- **One** lookahead row convolution layer +- **One** fully-connected layer +- **One** CTC-loss layer + +
+
+Figure 1. Architecture of Deep Speech 2 Network. +
+ +We don't have to stick to this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments. + +Key ingredients about the layers: + +- **Data Layers**: + - Frame sequences data of audio **spectrogram** (with FFT). + - Token sequences data of **transcription** text (labels). + - These two types of sequences do not have the same lengths, thus a CTC-loss layer is required. +- **2D Convolution Layers**: + - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension). + - With striding for only the first convolution layer. + - No pooling for all convolution layers. +- **Uni-directional RNNs** + - Uni-directional + row convolution: for low-latency inference. + - Bi-directional + without row convolution: if we don't care about the inference latency. +- **Row convolution**: + - For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs. + - Not necessary if with bi-directional RNNs. + - "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across. +- **Batch Normalization Layers**: + - Added to all above layers (except for data and loss layer). + - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration. + + +Required Components | PaddlePaddle Support | Need to Develop +:------------------------------------- | :-------------------------------------- | :----------------------- +Data Layer I (Spectrogram) | Not supported yet. 
| TBD (Task 3) +Data Layer II (Transcription) | `paddle.data_type.integer_value_sequence` | - +2D Convolution Layer | `paddle.layer.image_conv_layer` | - +DataType Converter (vec2seq) | `paddle.layer.block_expand` | - +Bi-/Uni-directional RNNs | `paddle.layer.recurrent_group` | - +Row Convolution Layer | Not supported yet. | TBD (Task 4) +CTC-loss Layer | `paddle.layer.warp_ctc` | - +Batch Normalization Layer | `paddle.layer.batch_norm` | - +CTC-Beam search | Not supported yet. | TBD (Task 6) + +### Row Convolution + +TODO by Assignees + +### Beam Search with CTC and LM + +TODO by Assignees + +## Future Work + +- Efficiency Improvement +- Accuracy Improvement +- Low-latency Inference Library +- Large-scale benchmarking + +## References + +1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. +2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. diff --git a/doc/design/speech/image/ds2_network.png b/doc/design/speech/image/ds2_network.png new file mode 100644 index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11 Binary files /dev/null and b/doc/design/speech/image/ds2_network.png differ diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index c14160d55ec8fdb9fc552da33f3a3dac13c1a764..138efb566e43fa71952f057829c2afbca96cadc9 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -311,3 +311,13 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异 * 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 主要的解决办法是减小学习律或者对数据进行归一化处理。 + +15. 
编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2 +------------------------------------------------------------------------ +先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载: + +pip uninstall py_paddle paddle + +然后安装paddle的python环境, 在build目录下执行 + +pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index 69f4501f370dcc9d603ec54a63d68568d66e832e..c0608ede8e57b224dae4b3d510d704a8b0918b53 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -22,6 +22,7 @@ To compile the source code, your computer must be equipped with the following de - **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X) - **BLAS**: MKL, OpenBlas or ATLAS - **Python**: only support Python 2.7 +- **Go** **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! For CUDA 8.0, GCC versions later than 5.3 are not supported! @@ -107,6 +108,18 @@ As a simple example, consider the following: sudo apt-get install -y python python-pip python-numpy libpython-dev bison sudo pip install 'protobuf==3.1.0.post1' + # Install Go + # You can follow https://golang.org/doc/install for a detailed explanation. 
+ wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ + tar -C $HOME -xzf go.tgz && \ + mkdir $HOME/gopath && \ + rm go.tgz + + # Setup environment variables + export GOROOT=$HOME/go + export GOPATH=$HOME/gopath + export PATH=$PATH:$GOROOT/bin + # install cmake 3.4 curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \ diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index da2d4234658b6ea4730346e721437cc1633c4362..84e33177740ca1652efc09c8081c2519b4366906 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -3,6 +3,43 @@ PaddlePaddle的Docker容器使用方式 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 +Docker使用入门 +------------------------------ + +几个基础的概念帮助理解和使用Docker: + +- *镜像*:一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行: + + .. code-block:: bash + + docker images + + 来列出当前系统中的所有镜像,同样可以执行: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:0.10.0 + + 来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用ocker.paddlepaddle.org/paddle下载。 + +- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。 + 实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。 + 可以执行: + + .. code-block:: bash + + docker run paddlepaddle/paddle:0.10.0 + + 来使用一个镜像启动一个容器。 + +- 默认情况下,Docker容器会运行在独立的文件系统空间之上,我们无法在Docker容器中 + 访问到主机上的文件。可以通过*挂载Volume*的方式,将主机上的文件或目录挂载到 + Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下,容器使用 + debian镜像,并且启动后执行 :code:`ls /data`。 + + .. 
code-block:: bash + + docker run --rm -v $(pwd):/data debian ls /data PaddlePaddle发布的Docker镜像使用说明 ------------------------------ @@ -12,13 +49,13 @@ PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打 像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 -`dockerhub.com `_ 提供最新 -的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国 -内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您 -在国内,请把文档里命令中的paddlepaddle/paddle替换成 -docker.paddlepaddle.org/paddle。 +`dockerhub.com `_ +和国内镜像`docker.paddlepaddle.org` 提供最新 +的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。 -1. 开发镜像::code:`paddlepaddle/paddle:-dev` +**注意:为了方便在国内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您在国内,请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。** + +1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev` 这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布, 文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。 @@ -37,13 +74,13 @@ docker.paddlepaddle.org/paddle。 .. code-block:: bash - docker run -it --rm paddlepaddle/paddle:-dev /bin/bash + docker run -it --rm -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /bin/bash 或者,可以以后台进程方式运行容器: .. code-block:: bash - docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:-dev + docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D 然后用密码 :code:`root` SSH进入容器: @@ -68,12 +105,14 @@ docker.paddlepaddle.org/paddle。 如果输出是No,就需要选择使用no-AVX的镜像 + **注:在0.10.0之后的版本,PaddlePaddle都可以自动判断硬件是否支持AVX,所以无需判断AVX即可使用** + 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 .. 
code-block:: bash - nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash + nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash 注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。: @@ -81,7 +120,7 @@ docker.paddlepaddle.org/paddle。 export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:-gpu + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu 3. 运行以及发布您的AI程序 @@ -98,7 +137,7 @@ docker.paddlepaddle.org/paddle。 nvidia-docker run -it -v $PWD:/work paddle /work/a.py - 这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:` + 这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0` 创建和发布自己的AI程序镜像。 运行PaddlePaddle Book @@ -177,7 +216,7 @@ Paddle的Docker开发镜像带有一个通过 `woboq code browser .. code-block:: bash - docker run -d --name paddle-cpu-doc paddle:-dev + docker run -d --name paddle-cpu-doc paddle:0.10.0-dev docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx 接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 03df497506099d2fb758bd0ab437d2c082f2b537..94860240f6a4a9bed8a865684a8a79960489280e 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -23,7 +23,7 @@ Docker is simple as long as we understand a few basic concepts: .. code-block:: bash - docker pull paddlepaddle/paddle:0.10.0rc2 + docker pull paddlepaddle/paddle:0.10.0 to download a Docker image, paddlepaddle/paddle in this example, from Dockerhub.com. @@ -35,7 +35,7 @@ Docker is simple as long as we understand a few basic concepts: .. 
code-block:: bash - docker run paddlepaddle/paddle:0.10.0rc2 + docker run paddlepaddle/paddle:0.10.0 to start a container to run a Docker image, paddlepaddle/paddle in this example. @@ -62,13 +62,36 @@ of PaddlePaddle, we release both of them. Production image includes CPU-only version and a CUDA GPU version and their no-AVX versions. We put the docker images on `dockerhub.com -`_. You can find the -latest versions under "tags" tab at dockerhub.com. If you are in -China, you can use our Docker image registry mirror to speed up the -download process. To use it, please replace all paddlepaddle/paddle in -the commands to docker.paddlepaddle.org/paddle. +`_. You can find the +latest versions under "tags" tab at dockerhub.com. -1. Production images, this image might have multiple variants: +** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.** + + +1. development image :code:`paddlepaddle/paddle:-dev` + + This image has packed related develop tools and runtime + environment. Users and developers can use this image instead of + their own local computer to accomplish development, build, + releasing, document writing etc. While different version of paddle + may depends on different version of libraries and tools, if you + want to setup a local environment, you must pay attention to the + versions. The development image contains: + + - gcc/clang + - nvcc + - Python + - sphinx + - woboq + - sshd + + Many developers use servers with GPUs, they can use ssh to login to + the server and run :code:`docker exec` to enter the docker + container and start their work. Also they can start a development + docker image with SSHD service, so they can login to the container + and start work. + +2. 
Production images, this image might have multiple variants: - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` @@ -84,12 +107,12 @@ the commands to docker.paddlepaddle.org/paddle. if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - + **NOTE:versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.** To run the CPU-only image as an interactive container: .. code-block:: bash - docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash + docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash Above method work with the GPU image too -- the recommended way is using `nvidia-docker `_. @@ -101,30 +124,7 @@ the commands to docker.paddlepaddle.org/paddle. .. code-block:: bash - nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash - -2. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. + nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash Train Model Using Python API @@ -149,13 +149,13 @@ Run the program using docker: .. 
code-block:: bash - docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py + docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py Or if you are using GPU for training: .. code-block:: bash - nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py + nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py Above commands will start a docker container by running :code:`python /workspace/example.py`. It will stop once :code:`python @@ -166,7 +166,7 @@ run PaddlePaddle program interactively: .. code-block:: bash - docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash + docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash # now we are inside docker container cd /workspace python example.py @@ -175,7 +175,7 @@ Running with GPU is identical: .. code-block:: bash - nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash + nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash # now we are inside docker container cd /workspace python example.py diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py index 679d0a931a7d650108ea89a04080a55d2976f72e..7e604f23de38543a00f305d508af0791193f78ba 100644 --- a/doc/getstarted/concepts/src/train.py +++ b/doc/getstarted/concepts/src/train.py @@ -31,7 +31,7 @@ def event_handler(event): # define training dataset reader def train_reader(): train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) - train_y = np.array([-2, -3, -7, -7]) + train_y = np.array([[-2], [-3], [-7], [-7]]) def reader(): for i in xrange(train_y.shape[0]): diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst index 
e63ca11102c8ce457afcc3c262fa5f159361c01d..f15b11bd780402a3ec1755900e8c648f5d2a7bc5 100644 --- a/doc/getstarted/concepts/use_concepts_cn.rst +++ b/doc/getstarted/concepts/use_concepts_cn.rst @@ -111,7 +111,7 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 # define training dataset reader def train_reader(): train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) - train_y = np.array([-2, -3, -7, -7]) + train_y = np.array([[-2], [-3], [-7], [-7]]) def reader(): for i in xrange(train_y.shape[0]): yield train_x[i], train_y[i] diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index 0cb27f802c40ef123fdc9c6799aad3b2a5f554c0..aa418c657a4ba16cce61c030066f4d3e14e891cc 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -7,4 +7,4 @@ build_and_install/index_cn.rst concepts/use_concepts_cn.rst -- `深度学习入门课程 `_ +- `深度学习入门课程 `_ diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index 9f771e93e8b63eb98e31ec12667bd1aa007af20e..be3253e3d41b99a2b696e2c5ef6463ed49680d69 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -6,4 +6,4 @@ GET STARTED build_and_install/index_en.rst -- `Deep Learning 101 `_ +- `Deep Learning 101 `_ diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst index 79048e92482851af6c2dd7d055868ebcaa7a298b..e05173c2006ff47ecb6ca5a4fe1502de750acc59 100644 --- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst +++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst @@ -28,17 +28,17 @@ pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API seq_pool = pooling(input=layer, pooling_type=pooling.Max(), - agg_level=AggregateLevel.EACH_SEQUENCE) + agg_level=AggregateLevel.TO_SEQUENCE) - `pooling_type` 目前支持两种,分别是:pooling.Max()和pooling.Avg()。 -- `agg_level=AggregateLevel.EACH_TIMESTEP` 时(默认值): +- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时(默认值): - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 - 输入:一个双层序列,或一个单层序列 - 
输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) -- `agg_level=AggregateLevel.EACH_SEQUENCE` 时: +- `agg_level=AggregateLevel.TO_SEQUENCE` 时: - 作用:一个双层序列经过运算变成一个单层序列 - 输入:必须是一个双层序列 @@ -52,15 +52,15 @@ last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详 .. code-block:: bash last = last_seq(input=layer, - agg_level=AggregateLevel.EACH_SEQUENCE) + agg_level=AggregateLevel.TO_SEQUENCE) -- `agg_level=AggregateLevel.EACH_TIMESTEP` 时(默认值): +- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时(默认值): - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 - 输入:一个双层序列或一个单层序列 - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 -- `agg_level=AggregateLevel.EACH_SEQUENCE` 时: +- `agg_level=AggregateLevel.TO_SEQUENCE` 时: - 作用:一个双层序列经过运算变成一个单层序列 - 输入:必须是一个双层序列 - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 @@ -74,9 +74,9 @@ expand 的使用示例如下,详细见 :ref:`api_v2.layer_expand` 配置API。 ex = expand(input=layer1, expand_as=layer2, - expand_level=ExpandLevel.FROM_TIMESTEP) + expand_level=ExpandLevel.FROM_NO_SEQUENCE) -- `expand_level=ExpandLevel.FROM_TIMESTEP` 时(默认值): +- `expand_level=ExpandLevel.FROM_NO_SEQUENCE` 时(默认值): - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2 可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst index 96e52b910a22576fd75c9d4e1bef6e2cf74bc84f..efdc44455ea4dc81a87b4d4fc8a81e78b15cb06a 100644 --- a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst +++ b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst @@ -81,7 +81,7 @@ * 在本例中,我们将原始数据的每一组,通过\ :code:`recurrent_group`\ 进行拆解,拆解成的每一句话再通过一个LSTM网络。这和单层RNN的配置是等价的。 -* 与单层RNN的配置类似,我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同,我们是对每一个子序列取最后一个元素,因此\ :code:`agg_level=AggregateLevel.EACH_SEQUENCE`\ 。 +* 与单层RNN的配置类似,我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同,我们是对每一个子序列取最后一个元素,因此\ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ 。 * 至此,\ :code:`lstm_last`\ 
便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。 diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst index 9e805ca85191b793c8798a239927a318c70b96f5..9ecab5594cff47cde4700b7ce0f58013a960a16e 100644 --- a/doc/howto/deep_model/rnn/index_cn.rst +++ b/doc/howto/deep_model/rnn/index_cn.rst @@ -4,6 +4,7 @@ RNN相关模型 .. toctree:: :maxdepth: 1 + rnn_config_cn.rst recurrent_group_cn.md hierarchical_layer_cn.rst hrnn_rnn_api_compare_cn.rst diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst index 13a153b05c578e0af82ee29db5ea27fd4b6d6f59..7adc79873d699fdfd5a85034bcef964dd1f19132 100644 --- a/doc/howto/deep_model/rnn/index_en.rst +++ b/doc/howto/deep_model/rnn/index_en.rst @@ -1,2 +1,7 @@ RNN Models ========== + +.. toctree:: + :maxdepth: 1 + + rnn_config_en.rst diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst index ac2bd0775f4ab2e0a0c37462e2c23001123b152b..4d684cf8ad5a8082cf31fb27027119b3d3e700b6 100644 --- a/doc/howto/deep_model/rnn/rnn_config_cn.rst +++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst @@ -5,36 +5,13 @@ RNN配置 中配置循环神经网络(RNN)。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中,您将了解如何: -- 准备用来学习循环神经网络的序列数据。 - 配置循环神经网络架构。 - 使用学习完成的循环神经网络模型生成序列。 我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence -模型的代码可以在\ ``demo / seqToseq``\ 找到。 - -准备序列数据 ------------- - -PaddlePaddle -不需要对序列数据进行任何预处理,例如填充。唯一需要做的是将相应类型设置为输入。例如,以下代码段定义了三个输入。 -它们都是序列,它们的大小是\ ``src_dict``\ ,\ ``trg_dict``\ 和\ ``trg_dict``\ : - -.. code:: python - - settings.input_types = [ - integer_value_sequence(len(settings.src_dict)), - integer_value_sequence(len(settings.trg_dict)), - integer_value_sequence(len(settings.trg_dict))] - -在\ ``process``\ 函数中,每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列: - -.. 
code:: python - - yield src_ids, trg_ids, trg_ids_next - -有关如何编写数据提供程序的更多细节描述,请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在 -``demo/seqToseq/dataprovider.py``\ 。 +模型的代码可以在 `book/08.machine_translation `_ 找到。 +wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py `_ 。 配置循环神经网络架构 -------------------- @@ -85,19 +62,19 @@ vanilla act=None, rnn_layer_attr=None): def __rnn_step__(ipt): - out_mem = memory(name=name, size=size) - rnn_out = mixed_layer(input = [full_matrix_projection(ipt), - full_matrix_projection(out_mem)], - name = name, - bias_attr = rnn_bias_attr, - act = act, - layer_attr = rnn_layer_attr, - size = size) + out_mem = paddle.layer.memory(name=name, size=size) + rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt), + paddle.layer.full_matrix_projection(input=out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) return rnn_out - return recurrent_group(name='%s_recurrent_group' % name, - step=__rnn_step__, - reverse=reverse, - input=input) + return paddle.layer.recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) PaddlePaddle 使用“Memory”(记忆模块)实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。 @@ -140,43 +117,52 @@ Sequence to Sequence Model with Attention .. 
code:: python # 定义源语句的数据层 - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) # 计算每个词的词向量 - src_embedding = embedding_layer( + src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) # 应用前向循环神经网络 - src_forward = grumemory(input=src_embedding, size=encoder_size) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) # 应用反向递归神经网络(reverse=True表示反向循环神经网络) - src_backward = grumemory(input=src_embedding, - size=encoder_size, - reverse=True) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) # 将循环神经网络的前向和反向部分混合在一起 - encoded_vector = concat_layer(input=[src_forward, src_backward]) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) # 投射编码向量到 decoder_size - encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], - size = decoder_size) + encoded_proj = paddle.layer.mixed( + size=decoder_size, + input=paddle.layer.full_matrix_projection(encoded_vector)) # 计算反向RNN的第一个实例 - backward_first = first_seq(input=src_backward) + backward_first = paddle.layer.first_seq(input=src_backward) # 投射反向RNN的第一个实例到 decoder size - decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) + decoder_boot = paddle.layer.mixed( + size=decoder_size, + act=paddle.activation.Tanh(), + input=paddle.layer.full_matrix_projection(backward_first)) 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在 ``gru_decoder_with_attention`` 中定义: .. 
code:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] - trg_embedding = embedding_layer( - input=data_layer(name='target_language_word', - size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) group_inputs.append(trg_embedding) # 对于配备有注意力机制的解码器,在训练中, @@ -185,9 +171,10 @@ Sequence to Sequence Model with Attention # StaticInput 意味着不同时间步的输入都是相同的值, # 否则它以一个序列输入,不同时间步的输入是不同的。 # 所有输入序列应该有相同的长度。 - decoder = recurrent_group(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) 单步函数的实现如下所示。首先,它定义解码网络的\ **Memory**\ 。然后定义 attention,门控循环单元单步函数和输出函数: @@ -198,27 +185,32 @@ attention,门控循环单元单步函数和输出函数: # 定义解码器的Memory # Memory的输出定义在 gru_step 内 # 注意 gru_step 应该与它的Memory名字相同 - decoder_mem = memory(name='gru_decoder', - size=decoder_size, - boot_layer=decoder_boot) + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) # 计算 attention 加权编码向量 - context = simple_attention(encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) # 混合当前词向量和attention加权编码向量 - decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), - 
full_matrix_projection(current_word)], - size = decoder_size * 3) + decoder_inputs = paddle.layer.mixed( + size=decoder_size * 3, + input=[ + paddle.layer.full_matrix_projection(input=context), + paddle.layer.full_matrix_projection(input=current_word) + ]) # 定义门控循环单元循环神经网络单步函数 - gru_step = gru_step_layer(name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) # 定义输出函数 - out = mixed_layer(input=[full_matrix_projection(input=gru_step)], - size=target_dict_dim, - bias_attr=True, - act=SoftmaxActivation()) + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) return out 生成序列 @@ -238,41 +230,32 @@ attention,门控循环单元单步函数和输出函数: - ``beam_size``: beam search 算法中的beam大小。 - ``max_length``: 生成序列的最大长度。 -- 使用 ``seqtext_printer_evaluator`` - 根据索引矩阵和字典打印文本。这个函数需要设置: - - - ``id_input``: 数据的整数ID,用于标识生成的文件中的相应输出。 - - ``dict_file``: 用于将词ID转换为词的字典文件。 - - ``result_file``: 生成结果文件的路径。 - 代码如下: .. 
code:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] # 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。 # 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。 # 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 。 - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) group_inputs.append(trg_embedding) - beam_gen = beam_search(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, # Beginnning token. - eos_id=1, # End of sentence token. - beam_size=beam_size, - max_length=max_length) - - seqtext_printer_evaluator(input=beam_gen, - id_input=data_layer(name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) - -注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。 - -完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。 + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginnning token. + eos_id=1, # End of sentence token. 
+ beam_size=beam_size, + max_length=max_length) + + return beam_gen + +注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 `book/06.understand_sentiment `_ 了解更多详细信息。 + +完整的配置文件在 `book/08.machine_translation/train.py `_ 。 diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst index 73f5d5371fcd3ce95253cad47b0d8e738284441c..2b581290a41005c04cb1d8b6febe57f17d2416d3 100644 --- a/doc/howto/deep_model/rnn/rnn_config_en.rst +++ b/doc/howto/deep_model/rnn/rnn_config_en.rst @@ -3,34 +3,11 @@ RNN Configuration This tutorial will guide you how to configure recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to: -- prepare sequence data for learning recurrent neural networks. - configure recurrent neural network architecture. - generate sequence with learned recurrent neural network models. -We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at :code:`demo/seqToseq`. - -===================== -Prepare Sequence Data -===================== - -PaddlePaddle does not need any preprocessing to sequence data, such as padding. The only thing that needs to be done is to set the type of the corresponding type to input. For example, the following code snippets defines three input. All of them are sequences, and the size of them are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`: - -.. code-block:: python - - settings.input_types = [ - integer_value_sequence(len(settings.src_dict)), - integer_value_sequence(len(settings.trg_dict)), - integer_value_sequence(len(settings.trg_dict))] - - -Then at the :code:`process` function, each :code:`yield` function will return three integer lists. Each integer list is treated as a sequence of integers: - -.. 
code-block:: python - - yield src_ids, trg_ids, trg_ids_next - - -For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`. +We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at `book/08.machine_translation `_ . +And the data preparation of this model can be found at `python/paddle/v2/dataset/wmt14.py `_ =============================================== Configure Recurrent Neural Network Architecture @@ -75,19 +52,19 @@ Its **output function** simply takes :math:`x_t` as the output. act=None, rnn_layer_attr=None): def __rnn_step__(ipt): - out_mem = memory(name=name, size=size) - rnn_out = mixed_layer(input = [full_matrix_projection(ipt), - full_matrix_projection(out_mem)], - name = name, - bias_attr = rnn_bias_attr, - act = act, - layer_attr = rnn_layer_attr, - size = size) + out_mem = paddle.layer.memory(name=name, size=size) + rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt), + paddle.layer.full_matrix_projection(input=out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) return rnn_out - return recurrent_group(name='%s_recurrent_group' % name, - step=__rnn_step__, - reverse=reverse, - input=input) + return paddle.layer.recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) PaddlePaddle uses memory to construct step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and a **input**. The output of memory at the current time step is utilized as the input of the memory at the next time step. 
A memory can also has a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of :code:`out_mem` memory. @@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge .. code-block:: python # Define the data layer of the source sentence. - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) # Calculate the word embedding of each word. - src_embedding = embedding_layer( + src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) # Apply forward recurrent neural network. - src_forward = grumemory(input=src_embedding, size=encoder_size) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) # Apply backward recurrent neural network. reverse=True means backward recurrent neural network. - src_backward = grumemory(input=src_embedding, - size=encoder_size, - reverse=True) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) # Mix the forward and backward parts of the recurrent neural network together. - encoded_vector = concat_layer(input=[src_forward, src_backward]) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) # Project encoding vector to decoder_size. 
- encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], - size = decoder_size) + encoded_proj = paddle.layer.mixed( + size=decoder_size, + input=paddle.layer.full_matrix_projection(encoded_vector)) # Compute the first instance of the backward RNN. - backward_first = first_seq(input=src_backward) + backward_first = paddle.layer.first_seq(input=src_backward) # Project the first instance of backward RNN to decoder size. - decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) + decoder_boot = paddle.layer.mixed( + size=decoder_size, + act=paddle.activation.Tanh(), + input=paddle.layer.full_matrix_projection(backward_first)) The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`: .. code-block:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] - trg_embedding = embedding_layer( - input=data_layer(name='target_language_word', - size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) group_inputs.append(trg_embedding) # For decoder equipped with attention mechanism, in training, @@ -158,9 +144,10 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network. # StaticInput means the same value is utilized at different time steps. 
# Otherwise, it is a sequence input. Inputs at different time steps are different. # All sequence inputs should have the same length. - decoder = recurrent_group(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function: @@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th # Defines the memory of the decoder. # The output of this memory is defined in gru_step. # Notice that the name of gru_step should be the same as the name of this memory. - decoder_mem = memory(name='gru_decoder', - size=decoder_size, - boot_layer=decoder_boot) + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) # Compute attention weighted encoder vector. - context = simple_attention(encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) # Mix the current word embedding and the attention weighted encoder vector. - decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), - full_matrix_projection(current_word)], - size = decoder_size * 3) + decoder_inputs = paddle.layer.mixed( + size=decoder_size * 3, + input=[ + paddle.layer.full_matrix_projection(input=context), + paddle.layer.full_matrix_projection(input=current_word) + ]) # Define Gated recurrent unit recurrent neural network step function. 
- gru_step = gru_step_layer(name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) # Defines the output function. - out = mixed_layer(input=[full_matrix_projection(input=gru_step)], - size=target_dict_dim, - bias_attr=True, - act=SoftmaxActivation()) + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) return out @@ -207,45 +199,37 @@ After training the model, we can use it to generate sequences. A common practice - :code:`eos_id`: the end token. Every sentence ends with the end token. - :code:`beam_size`: the beam size used in beam search. - :code:`max_length`: the maximum length of the generated sentences. - -* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set: - - - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files. - - :code:`dict_file`: the dictionary file for converting word id to word. - - :code:`result_file`: the path of the generation result file. The code is listed below: .. code-block:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] # In generation, decoder predicts a next target word based on # the encoded source sequence and the last generated target word. # The encoded source sequence (encoder's output) must be specified by # StaticInput which is a read-only memory. # Here, GeneratedInputs automatically fetchs the last generated word, # which is initialized by a start mark, such as . 
- trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) group_inputs.append(trg_embedding) - beam_gen = beam_search(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, # Beginnning token. - eos_id=1, # End of sentence token. - beam_size=beam_size, - max_length=max_length) + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginnning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) - seqtext_printer_evaluator(input=beam_gen, - id_input=data_layer(name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) + return beam_gen -Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details. +Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment `_ for more details. -The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`. +The full configuration file is located at `book/08.machine_translation/train.py `_ . diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index a48b143c760c6fc6fc08e793e4cf2f82f6713dc0..699390145226ec2b65fdf5122db187e1d30d669e 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -84,7 +84,7 @@ no changes added to commit (use "git add" and/or "git commit -a") ➜ docker build -t paddle:dev . 
``` -随后可以用这个开发镜像开build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: +随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: ```bash ➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md index 9b0d3e83c0dc264650eda73e6801c60a75439b4a..40d1eb62d722244139cc84eb170c190d988f5626 100644 --- a/doc/howto/dev/contribute_to_paddle_en.md +++ b/doc/howto/dev/contribute_to_paddle_en.md @@ -4,9 +4,9 @@ We sincerely appreciate your contributions. You can use fork and pull request workflow to merge your code. ## Code Requirements -- Your code must be fully documented by - [doxygen](http://www.stack.nl/~dimitri/doxygen/) style. -- Make sure the compiler option WITH\_STYLE\_CHECK is on and the compiler +- Your code comments must be fully documented by + [Doxygen](http://www.stack.nl/~dimitri/doxygen/) style. +- Make sure the compiler option `WITH_STYLE_CHECK` is on and the compiler passes the code style check. - All code must have unit test. - Pass all unit tests. @@ -20,32 +20,25 @@ It's just that simple. ## Clone -Paddle is currently using [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/). -The **develop** is the main branch, and other user's branches are feature branches. +Clone remote repository. -Once you've created a fork, you can use your favorite git client to clone your -repo or just head straight to the command line: - -```shell -# Clone your fork to your local machine -git clone --branch develop https://github.com/USERNAME/Paddle.git -``` -If your repository doesn't contain **develop** branch, just create it by your own. - -```shell -git clone https://github.com/USERNAME/Paddle.git Paddle -cd Paddle -git checkout -b develop # create develop branch. 
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git # add upstream to baidu/Paddle -git pull upstream develop # update to upstream +```bash +➜ git clone https://github.com/USERNAME/Paddle +➜ cd Paddle ``` -Then you can start to develop by making a local developement branch +## Create a local branch + +Paddle is currently using [Git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/). -```shell -git checkout -b MY_COOL_STUFF_BRANCH +All feature and bug-fix development work should be done on a new branch, generally created from the `develop` branch. + +```bash +➜ git checkout -b my-cool-stuff ``` +Before the checkout, keep the working directory of the current branch clean (you can check it with `git status`); otherwise untracked files will be carried over to the new branch. + ## Using `pre-commit` hook Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git @@ -58,89 +51,169 @@ To use [pre-commit](http://pre-commit.com/), you should install it by `pip install pre-commit`, and currently, Paddle uses `clang-format` to format c/cpp sources. Please make sure clang-format 3.8+ installed. -Then just run `pre-commit install` in your Paddle clone directory. When you -commit your code, the pre-commit hook will check the local code if there is +Install and run it as follows: + +```bash +➜ pip install pre-commit +➜ pre-commit install +``` + +When you commit your code, the pre-commit hook will check the local code if there is anything not suitable to commit, and so on. +## Start to develop + +In this tutorial, I deleted a line in README.md and created a new file. + +We can use `git status` to inspect the changes in the current directory and `git diff` to see the differences. + +```bash +➜ git status +On branch test +Changes not staged for commit: + (use "git add ..." to update what will be committed) + (use "git checkout -- ..." 
to discard changes in working directory) + + modified: README.md + +Untracked files: + (use "git add ..." to include in what will be committed) + + test + +no changes added to commit (use "git add" and/or "git commit -a") +``` +## Build and Test + +We package PaddlePaddle's compile environment into a Docker image, called the develop image named `paddle:dev`, it contains all compiling tools that PaddlePaddle needs. + +If you want to build the develop image, just run: + +```bash +➜ docker build -t paddle:dev . +``` + +Then we can use the develop image to build PaddlePaddle source. For example: + +```bash +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev +``` + +The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. + +Then we can generate the production image by copying the compiled PaddlePaddle program into the image by + +```bash +➜ docker build -t paddle:prod -f build/Dockerfile . +``` + +Run unit test finally: + +```bash +➜ docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +``` + +For more details, you can read [this doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst). + ## Commit -Commit your changes by following command lines: +Next we cancel the changes to the README.md file and then commit our changes by following command lines: + +```bash +➜ git checkout -- README.md +➜ git status +On branch test +Untracked files: + (use "git add ..." 
to include in what will be committed) + + test + +nothing added to commit but untracked files present (use "git add" to track) +➜ git add test +``` -```shell -# show the working tree status -git status -# add modified files -git add xx -env EDITOR=vim git commit # You can write your comments by vim/nano/emacs. +We should write a description of each commit by `git commit` to allow others to know +the changes in these files. + +```bash +➜ git commit +CRLF end-lines remover...............................(no files to check)Skipped +yapf.................................................(no files to check)Skipped +Check for added large files..............................................Passed +Check for merge conflicts................................................Passed +Check for broken symlinks................................................Passed +Detect Private Key...................................(no files to check)Skipped +Fix End of Files.....................................(no files to check)Skipped +clang-formater.......................................(no files to check)Skipped +[my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 ``` -The first line of commit infomation is the title. The second and later lines -are the details if any. ## Keeping Fork Up to Date Before pull your request, you should sync your code from the latest PaddlePaddle. 
To do this, you'll need to add a remote at first: -```shell -# see the current configured remote repository -git remote -v -# add upstream repository -git remote add upstream https://github.com/PaddlePaddle/Paddle.git -# verify the new upstream -git remote -v +```bash +➜ git remote add upstream https://github.com/PaddlePaddle/Paddle +➜ git remote +origin +upstream ``` Update your fork with the latest upstream changes: -```shell -git pull --rebase upstream develop +```bash +➜ git fetch upstream +➜ git pull upstream develop ``` -If there are no unique commits locally, git will simply perform a fast-forward. -However, if you have been making changes (in the vast majority of cases you -probably shouldn't be), you may have to deal with conflicts. - Now, your local master branch is up-to-date with everything modified upstream. ## Push to GitHub -```shell +```bash # push to your repository in Github -git push -u origin MY_COOL_STUFF_BRANCH # create remote branch MY_COOL_STUFF_BRANCH to origin. +➜ git push origin my-cool-stuff ``` -## Pull Request +## Create an issue and a Pull Request + +Create an Issue to describe the problem and record its number. Go to the page for your fork on GitHub, select your development branch, -and click the **pull request button**. - -## Update your pull request with the lastest version - -During the code review, your pull request may become stale because new commits in -baidu/Paddle. GitHub allows autmotic update if there is no conflict. You can do this -by clicking the "Update Branch" button in your pull request page. However, in the case -of conflict, you need to do the update manually. You need to do the following on -your local repository: -```shell -git checkout MY_COOL_STUFF_BRANCH -git pull upstream develop -# You may need to resolve the conflict according to the git prompt. -# Make and test your code. -git push origin MY_COOL_STUFF_BRANCH +and click the `New pull request`. 
+ +screen shot 2017-04-26 at 9 09 28 pm + +Then select the target branch: + +screen shot 2017-04-26 at 9 11 52 pm + +We can add `resolve #Issue number` in PR description to close the issue automatically after the PR is merged. More details are in GitHub's help article on closing issues via commit messages. + +Then wait for review; if modifications are needed, refer to the above steps to update the corresponding origin branch. + +## Delete origin branch + +After the PR is merged into the main repository, we can delete the remote branch on the PR page. + +screen shot 2017-04-26 at 9 18 24 pm + +Or just run: + +```bash +➜ git push origin :my-cool-stuff ``` -Now your Pull Request is updated with the latest version. -## Revise your pull request +## Delete local branch -When you revise your pull request according to reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull requrest and the old pull request. +Finally, we delete the local branch: -The possible commands are +```bash +➜ git checkout develop -```shell -git checkout MY_COOL_STUFF_BRANCH -git pull upstream develop # update local to newest code base. -# May be some conflicts will occured. -# And develop your cool stuff -env EDITOR=vim git commit # add your revise log -git push origin MY_COOL_STUFF_BRANCH +# delete my-cool-stuff branch +➜ git branch -D my-cool-stuff ``` diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst index 9489a921c70ad6ee5709f46445554f5d9640162c..75037e693b32f923ee7dc9dfec322495fe4ce10a 100644 --- a/doc/howto/dev/new_layer_cn.rst +++ b/doc/howto/dev/new_layer_cn.rst @@ -37,7 +37,7 @@ \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -假设 :math:`z = f(W^T x + b)` ,那么 +假设 :math:`z = W^T x + b` ,那么 ..
math:: diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst index 46481f5ead33dc6a26507e021fd9ae0f8316e940..110a9fb38f890a766bb4480e91feb22d3b0838a5 100644 --- a/doc/howto/dev/new_layer_en.rst +++ b/doc/howto/dev/new_layer_en.rst @@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu. -The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. +The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. Suppose our loss function is :math:`c(y)`, then @@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -Suppose :math:`z = f(W^T x + b)`, then +Suppose :math:`z = W^T x + b`, then .. math:: @@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class. Then, for fully connected layer, we need to compute: .. 
math:: - + \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`. @@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes /* weight */ true); } } - + If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. .. code-block:: bash diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index d536f53abc031e9d279ace0e231a381a2f1e81b6..36e5d420c986fc8d88eefee4aa221dba0a0480f2 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -41,7 +41,7 @@ PaddlePaddle文档需要准备的环境相对较复杂,所以我们推荐使 python -c "import py_paddle" -如果提示错误,那么用户需要在本地编译安装PaddlePaddle,请参考 `源码编译文档 `_ 。 +如果提示错误,那么用户需要在本地编译安装PaddlePaddle,请参考 `源码编译文档 `_ 。 注意,用户在首次编译安装PaddlePaddle时,请将WITH_DOC选项关闭。在编译安装正确之后,请再次确认py_paddle包已经安装,即可进行下一步操作。 如果提示正确,可以执行以下命令编译生成文档,即 @@ -68,9 +68,9 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程 如何更新www.paddlepaddle.org文档 ================================ -开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中,提交方式可参见 `贡献文档 `_ 。 -目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 -`英文文档 `_ 。 +开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中,提交方式可参见 `贡献文档 `_ 。 +目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 +`英文文档 `_ 。 diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in index 
95cad835b11816f4d2e256c2abd662a545a5bad2..41b35b5b233abd737db07aaeb6c6dd4bf6d42b08 100644 --- a/doc/templates/conf.py.cn.in +++ b/doc/templates/conf.py.cn.in @@ -13,22 +13,18 @@ # serve to show the default. import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PROJ_ROOT@/doc_theme/templates"] +templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"] # -- General configuration ------------------------------------------------ @@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['@PROJ_ROOT@/doc_theme/static'] +html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static'] # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in index b477f0120c4fa0544012080b7cfb8572d3c44b04..5822c2481dd61da2084b0de76f6f65aa4e32e033 100644 --- a/doc/templates/conf.py.en.in +++ b/doc/templates/conf.py.en.in @@ -13,15 +13,11 @@ # serve to show the default. 
import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser @@ -29,7 +25,7 @@ AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PROJ_ROOT@/doc_theme/templates"] +templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"] # -- General configuration ------------------------------------------------ @@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['@PROJ_ROOT@/doc_theme/static'] +html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static'] # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/doc_theme/templates/layout.html b/doc_theme/templates/layout.html index 65e61c5f298e19adc6330c378779a6edf418752e..9fca69dc4e7f0827acfc755a97a662350214b90e 100644 --- a/doc_theme/templates/layout.html +++ b/doc_theme/templates/layout.html @@ -101,7 +101,7 @@