Commit 18a3588b authored by Yu Yang

Merge branch 'develop' of github.com:baidu/Paddle into feature/c_api

@@ -12,19 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
-cmake_minimum_required(VERSION 3.0)
-project(paddle CXX C)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
+include(system)
+
+if(ANDROID)
+    cmake_minimum_required(VERSION 3.7)
+else()
+    cmake_minimum_required(VERSION 3.0)
+endif()
+
+project(paddle CXX C)

 find_package(Sphinx)
-find_package(CUDA QUIET)
+if(NOT CMAKE_CROSSCOMPILING)
+    find_package(CUDA QUIET)
+endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-include(system)
 include(simd)

 ################################ Configurations #######################################
@@ -52,6 +59,21 @@ if(NOT CMAKE_BUILD_TYPE)
         FORCE)
 endif()

+if(ANDROID)
+    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+        message(FATAL_ERROR "Unsupported standalone toolchains with Android API level lower than 21")
+    endif()
+    set(WITH_GPU OFF CACHE STRING
+        "Disable GPU when cross-compiling for Android" FORCE)
+    set(WITH_AVX OFF CACHE STRING
+        "Disable AVX when cross-compiling for Android" FORCE)
+    set(WITH_PYTHON OFF CACHE STRING
+        "Disable PYTHON when cross-compiling for Android" FORCE)
+    set(WITH_RDMA OFF CACHE STRING
+        "Disable RDMA when cross-compiling for Android" FORCE)
+endif(ANDROID)
+
 set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
     "A path setting third party libraries download & build directories.")
@@ -72,6 +94,7 @@ include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
+include(external/any)       # download linb::any
 include(package)            # set paddle packages
 include(cpplint)            # set paddle c++ style
@@ -82,7 +105,6 @@ include(flags)              # set paddle compile flags
 include(cudnn)              # set cudnn libraries
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
 include(configure)          # add paddle env configuration

 include_directories("${PROJ_ROOT}")
......
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
+FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'

 # ENV variables
-ARG BUILD_WOBOQ
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
 ARG WITH_STYLE_CHECK

-ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
+ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_AVX:-OFF}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
@@ -37,18 +36,20 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8

+# FIXME: due to temporary ipykernel dependency issue, specify ipykernel and
+# jupyter versions until jupyter fixes this issue.
 RUN pip install --upgrade pip && \
     pip install -U 'protobuf==3.1.0' && \
     pip install -U wheel pillow BeautifulSoup && \
     pip install -U docopt PyYAML sphinx && \
     pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip install -U pre-commit 'requests==2.9.2' jupyter
+    pip install pre-commit 'requests==2.9.2' 'ipykernel==4.6.0' 'jupyter==1.0.0'

 RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
     cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
     cd .. && rm -rf cmake-3.4.1

-VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
+VOLUME ["/woboq_out"]

 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
......
@@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF)
 set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
 set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")

-find_path(MKL_INCLUDE_DIR mkl.h PATHS
+find_path(MKL_INC_DIR mkl.h PATHS
   ${MKL_ROOT}/include)
-find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
+find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
   ${MKL_ROOT}/include)
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
   ${MKL_ROOT}/lib
@@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
   ${MKL_ROOT}/lib/intel64)

-if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
+if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   set(CBLAS_PROVIDER MKL)
-  set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
+  set(CBLAS_INC_DIR ${MKL_INC_DIR})
   set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
                       ${MKL_SEQUENTIAL_LIB}
                       ${MKL_CORE_LIB})
   add_definitions(-DPADDLE_USE_MKL)
   message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   set(CBLAS_FOUND ON)
+  if(${MKL_LAPACK_INC_DIR})
+    add_definitions(-DPADDLE_USE_LAPACK)
+    message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
+  endif()
   return() # return file.
 endif()
@@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})

-if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
+if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND)
   set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
+  set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
   set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
   add_definitions(-DPADDLE_USE_ATLAS)
-  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   set(CBLAS_FOUND ON)
+  if(ATLAS_CLAPACK_INC_DIR)
+    add_definitions(-DPADDLE_USE_LAPACK)
+    message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
+  endif()
   return()
 endif()
@@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
   set(CBLAS_PROVIDER OPENBLAS)
   set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
   set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
-  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   set(CBLAS_FOUND ON)
+  if(OPENBLAS_LAPACKE_INC_DIR)
+    add_definitions(-DPADDLE_USE_LAPACK)
+    message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
+  endif()
   return()
 endif()
......
@@ -32,6 +32,14 @@ if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)

+if(NOT CMAKE_CROSSCOMPILING)
+    if(WITH_AVX AND AVX_FOUND)
+        set(SIMD_FLAG ${AVX_FLAG})
+    elseif(SSE3_FOUND)
+        set(SIMD_FLAG ${SSE3_FLAG})
+    endif()
+endif()
+
 if(NOT WITH_GPU)
     add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
@@ -48,21 +56,12 @@ else()
         message(FATAL_ERROR "Paddle need cudnn to compile")
     endif()

-    if(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
-    else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
-    endif(WITH_AVX)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")

     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)

-if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
-else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
-endif(WITH_AVX)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
......
+if(NOT WITH_GPU)
+    return()
+endif()
+
 set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
 find_path(CUDNN_INCLUDE_DIR cudnn.h
     PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
@@ -11,6 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
+    ${CUDNN_ROOT}/lib/x86_64-linux-gnu
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
......
INCLUDE(ExternalProject)
SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
ExternalProject_Add(
linb_any
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/thelink2012/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020"
PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
@@ -31,9 +31,17 @@ ExternalProject_Add(
     GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS      -DBUILD_TESTING=OFF
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 LIST(APPEND external_project_dependencies gflags)
@@ -33,11 +33,19 @@ ExternalProject_Add(
     GIT_REPOSITORY  "https://github.com/google/glog.git"
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS      -DWITH_GFLAGS=ON
     CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
     CMAKE_ARGS      -DBUILD_TESTING=OFF
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 LIST(APPEND external_project_dependencies glog)
@@ -41,11 +41,19 @@ IF(WITH_TESTING)
         GIT_TAG         "release-1.8.0"
         PREFIX          ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+        CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
         CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
         CMAKE_ARGS      -DBUILD_GMOCK=ON
         CMAKE_ARGS      -Dgtest_disable_pthreads=ON
         CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+        CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                         -DCMAKE_BUILD_TYPE:STRING=Release
 )
 LIST(APPEND external_project_dependencies gtest)
 ENDIF(WITH_TESTING)
@@ -29,7 +29,24 @@ IF(NOT ${CBLAS_FOUND})
     IF(CMAKE_COMPILER_IS_GNUCC)
         ENABLE_LANGUAGE(Fortran)
-        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+        if (NOT CMAKE_Fortran_COMPILER_VERSION)
+            # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly.
+            execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion
+                            OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION)
+        endif()
+        string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION})
+        list(GET Fortran_VERSION 0 Fortran_MAJOR)
+        list(GET Fortran_VERSION 1 Fortran_MINOR)
+        find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS
+            /lib
+            /usr/lib
+            /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/
+            /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/)
+        if (NOT GFORTRAN_LIBRARY)
+            message(FATAL_ERROR "Cannot find the gfortran library, which is needed by openblas")
+        endif()
+        find_package(Threads REQUIRED)
+        LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
     ENDIF(CMAKE_COMPILER_IS_GNUCC)

     IF(NOT CMAKE_Fortran_COMPILER)
@@ -37,6 +54,8 @@ IF(NOT ${CBLAS_FOUND})
             "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
     ENDIF(NOT CMAKE_Fortran_COMPILER)

+    ADD_DEFINITIONS(-DPADDLE_USE_LAPACK)
+
     ExternalProject_Add(
         openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
......
@@ -58,12 +58,20 @@ IF(NOT PROTOBUF_FOUND)
         GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
         CONFIGURE_COMMAND
         ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
             -Dprotobuf_BUILD_TESTS=OFF
             -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
             -DCMAKE_BUILD_TYPE=Release
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
             -DCMAKE_INSTALL_LIBDIR=lib
+        CMAKE_CACHE_ARGS
+            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
+            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+            -DZLIB_ROOT:STRING=${ZLIB_ROOT}
 )
 LIST(APPEND external_project_dependencies protobuf)
......
@@ -219,9 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
 ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)

-INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
-INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
-
-IF(NOT WITH_PYTHON)
+IF(WITH_PYTHON)
+    INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+    INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
+ELSE()
     SET(PYTHON_LIBRARIES "")
 ENDIF()
@@ -50,12 +50,19 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
     CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
     CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
     CMAKE_ARGS      -DWITH_TORCH=OFF
-    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
+    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
     CMAKE_ARGS      -DBUILD_SHARED=ON
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
 LIST(APPEND external_project_dependencies warpctc)
@@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire
 IF(WIN32)
     SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
 ELSE(WIN32)
-    set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+    SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)

 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
@@ -34,10 +34,18 @@ ExternalProject_Add(
     GIT_TAG         "v1.2.8"
     PREFIX          ${ZLIB_SOURCES_DIR}
     UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
     CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 LIST(APPEND external_project_dependencies zlib)
@@ -2,6 +2,7 @@
 include(CheckCXXCompilerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
+include(CheckTypeSize)

 function(CheckCompilerCXX11Flag)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
@@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag)
 endfunction()

 CheckCompilerCXX11Flag()
-LIST(APPEND CMAKE_CXX_FLAGS -std=c++11)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

 # safe_set_flag
 #
@@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS)
     endif()
 endif()

+SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
+CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)
+CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)
+if(SPINLOCK_FOUND)
+    add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
+endif(SPINLOCK_FOUND)
+if(BARRIER_FOUND)
+    add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
+endif(BARRIER_FOUND)
+SET(CMAKE_EXTRA_INCLUDE_FILES "")
+
 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
 set(COMMON_FLAGS
......
@@ -2,6 +2,7 @@
 # so that PaddlePaddle can unleash the vectorization power of multicore.

 INCLUDE(CheckCXXSourceRuns)
+INCLUDE(CheckCXXSourceCompiles)

 IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     set(MMX_FLAG "-mmmx")
@@ -17,6 +18,8 @@ ELSEIF(MSVC)
     SET(AVX2_FLAG "/arch:AVX2")
 ENDIF()

+set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
+
 # Check MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
@@ -73,4 +76,5 @@ int main()
     return 0;
 }" AVX2_FOUND)

+set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
@@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
 MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")

+IF(DEFINED CMAKE_SYSTEM_NAME)
+    IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android")
+        SET(ANDROID TRUE)
+    ENDIF()
+ENDIF()
+
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
......
@@ -90,6 +90,10 @@ function(link_paddle_exe TARGET_NAME)
         ${RDMA_LD_FLAGS}
         ${RDMA_LIBS})

+    if(ANDROID)
+        target_link_libraries(${TARGET_NAME} log)
+    endif(ANDROID)
+
     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
......
 import sys
 import paddle.v2 as paddle


-def seqToseq_net(source_dict_dim, target_dict_dim):
+def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
     ### Network Architecture
     word_vector_dim = 512  # dimension of word vector
     decoder_size = 512  # dimension of hidden unit in GRU Decoder network
     encoder_size = 512  # dimension of hidden unit in GRU Encoder network

+    beam_size = 3
+    max_length = 250
+
     #### Encoder
     src_word_id = paddle.layer.data(
         name='source_language_word',
@@ -67,79 +71,143 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
     group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
     group_inputs = [group_input1, group_input2]

-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For a decoder equipped with an attention mechanism, in training,
-    # the target embedding (the ground truth) is the data input,
-    # while the encoded source sequence is accessed as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+
+        # For a decoder equipped with an attention mechanism, in training,
+        # the target embedding (the ground truth) is the data input,
+        # while the encoded source sequence is accessed as an unbounded memory.
+        # Here, the StaticInput defines a read-only memory
+        # for the recurrent_group.
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts the next target word based on
+        # the encoded source sequence and the last generated target word.
+        # The encoded source sequence (the encoder's output) must be specified
+        # by StaticInput, which is a read-only memory.
+        # The embedding of the last generated word is automatically retrieved
+        # by GeneratedInputs, which is initialized by a start mark, such as <s>,
+        # and must be included in generation.
+        trg_embedding = paddle.layer.GeneratedInputV2(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen


 def main():
     paddle.init(use_gpu=False, trainer_count=1)
+    is_generating = False

     # source and target dict dim.
     dict_size = 30000
     source_dict_dim = target_dict_dim = dict_size

-    # define network topology
-    cost = seqToseq_net(source_dict_dim, target_dict_dim)
-    parameters = paddle.parameters.create(cost)
-
-    # define optimize method and trainer
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=5e-5,
-        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer)
-
-    # define data reader
-    feeding = {
-        'source_language_word': 0,
-        'target_language_word': 1,
-        'target_language_next_word': 2
-    }
-
-    wmt14_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
-        batch_size=5)
-
-    # define event_handler callback
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 10 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-
-    # start to train
-    trainer.train(
-        reader=wmt14_reader,
-        event_handler=event_handler,
-        num_passes=10000,
-        feeding=feeding)
+    # train the network
+    if not is_generating:
+        cost = seqToseq_net(source_dict_dim, target_dict_dim)
+        parameters = paddle.parameters.create(cost)
+
+        # define optimize method and trainer
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=5e-5,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+        trainer = paddle.trainer.SGD(cost=cost,
+                                     parameters=parameters,
+                                     update_equation=optimizer)
+        # define data reader
+        wmt14_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
+            batch_size=5)
+
+        # define event_handler callback
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 10 == 0:
+                    print "\nPass %d, Batch %d, Cost %f, %s" % (
+                        event.pass_id, event.batch_id, event.cost,
+                        event.metrics)
+                else:
+                    sys.stdout.write('.')
+                    sys.stdout.flush()

+        # start to train
+        trainer.train(
+            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+    # generate an English sequence to French
+    else:
+        # use the first 3 samples for generation
+        gen_creator = paddle.dataset.wmt14.gen(dict_size)
+        gen_data = []
+        gen_num = 3
+        for item in gen_creator():
+            gen_data.append((item[0], ))
+            if len(gen_data) == gen_num:
+                break
+
+        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
+        # get the pretrained model, whose bleu = 26.92
+        parameters = paddle.dataset.wmt14.model()
+        # prob is the prediction probabilities, and id is the prediction word.
+        beam_result = paddle.infer(
+            output_layer=beam_gen,
+            parameters=parameters,
+            input=gen_data,
+            field=['prob', 'id'])
+
+        # get the dictionary
+        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+        # the delimiting element of generated sequences is -1;
+        # the first element of each generated sequence is the sequence length
+        seq_list = []
+        seq = []
+        for w in beam_result[1]:
+            if w != -1:
+                seq.append(w)
+            else:
+                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
+                seq = []
+
+        prob = beam_result[0]
+        beam_size = 3
+        for i in xrange(gen_num):
+            print "\n*******************************************************\n"
+            print "src:", ' '.join(
+                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
+            for j in xrange(beam_size):
+                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]


 if __name__ == '__main__':
......
# Design Doc: Distributed Training
## Objective
In [these slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we would like PaddlePaddle to run on general-purpose clusters, like those managed by Kubernetes, so as to address demands for AI from both Internet and non-Internet industries.
This poses technical challenges to PaddlePaddle:
1. Support fault-recovery.
1. Support both offline and online training.
1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training.
## Training Job
A training job is created once a user asks the Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes:
1. the *master process*, which dispatches tasks to
1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
1. one or more *parameter server processes*, where each holds a shard of the global model.
Their relation is illustrated in the following graph:
<img src="src/paddle-model-sharding.png"/>
### Master Process
The master process will:
- Partition a dataset into [tasks](#task) and dispatch tasks to trainers.
- Keep track of training progress on the dataset with a [task queue](#task-queue). A training job iterates over the dataset one full pass at a time; a new pass starts only after the current one is finished.
#### Task
A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
#### Task Queue
The master process has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master process. Each master process has three task queues.
<img src="src/paddle-task-queues.png"/>
- The todo queue holds tasks to be dispatched. When a job starts, the master process fills in the todo queue with all tasks.
- The pending queue holds tasks that are currently training by trainers.
- The done queue holds tasks that are already trained.
The life cycle of a single task is illustrated below:
<img src="src/paddle-task-states.png"/>
1. When a new pass of training starts, all tasks will be placed in the todo queue.
1. The master process will dispatch a few tasks to each trainer at a time, put them in the pending queue, and wait for completion.
1. The trainer will work on its tasks and tell the master process once a task is completed. The master process will dispatch a new task to that trainer.
1. If a task times out, the master process will move it back to the todo queue and increase its timeout count by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
1. The master process will move completed tasks to the done queue. When the todo queue is empty, the master process will start a new pass by moving all tasks in the done queue back to the todo queue and resetting the timeout counter of every task to zero. The sketch after this list shows how these queues could fit together.
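A minimal, illustrative sketch of the three queues and the timeout policy described above; the `Task` fields, the queue operations, and the `TIMEOUT_LIMIT` threshold are assumptions for illustration, not the actual master implementation:

```python
import time


class Task(object):
    def __init__(self, task_id, shard):
        self.task_id = task_id
        self.shard = shard  # a data shard, e.g. a range of records
        self.timeout_count = 0
        self.deadline = None


class TaskQueues(object):
    TIMEOUT_LIMIT = 3  # hypothetical threshold for discarding flaky tasks

    def __init__(self, tasks):
        self.todo = list(tasks)
        self.pending = {}  # task_id -> Task, currently being trained
        self.done = []

    def dispatch(self, timeout_sec=600):
        """Move one task from todo to pending and hand it to a trainer."""
        if not self.todo:
            return None
        task = self.todo.pop(0)
        task.deadline = time.time() + timeout_sec
        self.pending[task.task_id] = task
        return task

    def mark_done(self, task_id):
        """A trainer reported completion: pending -> done."""
        self.done.append(self.pending.pop(task_id))

    def requeue_timeouts(self):
        """Timed-out tasks go back to todo; tasks that timed out too
        often are discarded, since they may crash trainers."""
        now = time.time()
        for task_id in list(self.pending):
            if self.pending[task_id].deadline < now:
                task = self.pending.pop(task_id)
                task.timeout_count += 1
                if task.timeout_count < self.TIMEOUT_LIMIT:
                    self.todo.append(task)

    def start_new_pass(self):
        """When todo is empty, move done back to todo and reset counters."""
        if not self.todo and not self.pending:
            self.todo, self.done = self.done, []
            for task in self.todo:
                task.timeout_count = 0
```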
### Trainer Process
The trainer process will:
- Receive tasks from the master.
- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
### Parameter Server Process
Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers.
The parameter server will:
- Receive gradients from the trainers, update its parameters, and give the trainers the latest parameters.
- Periodically save its parameters to distributed file system by overriding the previous save.
### Optimization Algorithms
The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm:
- Synchronous Stochastic Gradient Descent (sync-SGD)
The parameter server will wait for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting the new parameters to every trainer. Every trainer will wait for the new parameters before starting the (n+1)-th mini-batch.
- Asynchronous Stochastic Gradient Descent (async-SGD)
There will be no synchronization between different trainers, and the parameter server updates its parameters as soon as it receives a new gradient:
- Each trainer uploads its accumulated gradient every n mini-batches.
- Every m mini-batches, the trainer downloads new parameters from parameter server.
- n and m do not have to be equal. A sketch of this trainer loop appears after this list.
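As a concrete illustration of decoupling n from m, here is a hypothetical trainer loop; `upload_gradient`, `download_parameters`, and the `model` methods are stand-ins for the real trainer/parameter-server RPCs, which this design does not specify:

```python
def async_sgd_loop(reader, model, n, m):
    """Upload accumulated gradients every n mini-batches and download
    fresh parameters every m mini-batches; n and m may differ."""
    accumulated = model.zero_gradient()
    for batch_id, batch in enumerate(reader(), 1):
        accumulated = model.accumulate_gradient(accumulated, batch)
        if batch_id % n == 0:
            upload_gradient(accumulated)  # push to the parameter servers
            accumulated = model.zero_gradient()
        if batch_id % m == 0:
            # pull the newest model; no synchronization with other trainers
            model.set_parameters(download_parameters())
```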
## Fault Tolerance
The training job will pause if the master process is dead, or if any of the parameter server processes is dead. They will be restarted by [Kubernetes](https://kubernetes.io/) and recover in a few minutes. Please refer to [fault recovery](#fault-recovery).
The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm:
- sync-SGD
TODO
- async-SGD
Since async-SGD does not require synchronization between mini-batches, the system will by definition make progress if at least one trainer is running.
## Fault Recovery
PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into distributed file system, so a restarted parameter server can recover its parameters from the saved file.
Now we will introduce how each process recovers from a failure; the graph below shows how etcd is used:
<img src="src/paddle-etcd.png"/>
### Master Process
When the master is started by Kubernetes, it executes the following steps at startup:
1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
1. Starts dispatching the tasks to the trainers, and updates the task queues using an etcd transaction to ensure the lock is held during the update.
The master process will kill itself if its etcd lease expires.
When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in a few minutes.
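A minimal sketch of this startup sequence, assuming the third-party `etcd3` Python client (this design does not prescribe a client library, and `deserialize`/`build_task_queues` are hypothetical helpers):

```python
import etcd3

client = etcd3.client(host='127.0.0.1', port=2379)

# 1. Grab the unique master lock. The lock is backed by a lease, so it is
#    released automatically if this master dies, letting a replacement in.
master_lock = client.lock('master', ttl=10)
master_lock.acquire()

# 2. Recover the task queues if a previous master stored them.
value, _ = client.get('/task_queues')
task_queues = deserialize(value) if value else build_task_queues()

# 3. Watch the trainer prefix keys to find the live trainers.
events_iterator, cancel_watch = client.watch_prefix('/trainer/')
```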
### Trainer Process
When the trainer is started by Kubernetes, it executes the following steps at startup:
1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
1. Waits for tasks from the master to start training.
If the trainer's etcd lease expires, it will try to set the key `/trainer/<unique ID>` again so that the master process can discover the trainer again, as in the sketch below.
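Trainer registration and heartbeating could look like the following sketch; the `etcd3` client, the TTL values, and the address format are all assumptions:

```python
import time
import uuid

import etcd3

client = etcd3.client()
trainer_key = '/trainer/' + uuid.uuid4().hex  # unique ID


def register(ttl=10):
    # The key disappears automatically when its lease expires.
    lease = client.lease(ttl)
    client.put(trainer_key, 'trainer-host:7164', lease=lease)
    return lease


lease = register()
while True:  # heartbeat loop
    time.sleep(3)
    try:
        lease.refresh()
    except Exception:  # lease gone: set the key again
        lease = register()
```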
### Parameter Server Process
When the parameter server is started by Kubernetes, it executes the following steps at startup:
1. Read the desired total number of parameter servers from the etcd key `/ps_desired`.
1. Search through the etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name.
The desired number of parameter servers is 3:
<img src="src/paddle-ps-0.png"/>
The third parameter server joined:
<img src="src/paddle-ps-1.png"/>
1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
1. Now the parameter server is ready for the trainers' requests.
If the parameter server's etcd lease expires, the parameter server will kill itself.
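The index-claiming step maps naturally onto an etcd transaction. The sketch below again assumes the `etcd3` client and a made-up address format; the compare clause guarantees that two parameter servers can never claim the same index:

```python
import etcd3

client = etcd3.client()
value, _ = client.get('/ps_desired')
ps_desired = int(value)

my_index = None
while my_index is None:
    for i in range(ps_desired):
        key = '/ps/%d' % i
        # Succeeds only if the key does not exist yet (version == 0).
        succeeded, _ = client.transaction(
            compare=[client.transactions.version(key) == 0],
            success=[client.transactions.put(key, 'ps-host:7165')],
            failure=[])
        if succeeded:
            my_index = i  # then load saved parameters for this index, if any
            break
```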
## Dynamic Scaling
### Trainer Scaling
TODO
### Parameter Server Scaling
Not planned for v1.
## Training Dataset Format
TODO
## User Interface
TODO
@@ -4,119 +4,139 @@ Using PaddlePaddle with Docker containers
 The only officially supported way to run PaddlePaddle at the moment is in a Docker container, because Docker runs on all major operating systems (including Linux, Mac OS X, and Windows). Please note that you need to change the `Docker settings <https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use of your hardware resources on Mac OS X and Windows.

-Usage of the CPU-only and GPU Docker images
+Usage of the Docker images released by PaddlePaddle
 ------------------------------

-For each version of PaddlePaddle, we release two Docker images: a CPU-only one and a GPU one.
-We configure `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ to generate the latest Docker images automatically:
-`paddledev/paddle:0.10.0rc1-cpu` and `paddledev/paddle:0.10.0rc1-gpu`.
-
-To run the CPU-only image as an interactive container:
-
-.. code-block:: bash
-
-    docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
-
-or, run the container as a daemon:
-
-.. code-block:: bash
-
-    docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
-
-then SSH into the container with the password :code:`root`:
-
-.. code-block:: bash
-
-    ssh -p 2202 root@localhost
-
-An advantage of SSH is that we can enter the container from more than one terminal: for example, one terminal runs vi while another runs Python. Another advantage is that we can run the PaddlePaddle container on a remote server and SSH into it from a laptop.
-
-The methods above also work with the GPU image -- just do not forget to install the CUDA driver and tell Docker about it:
-
-.. code-block:: bash
-
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+For each version of PaddlePaddle, we release two kinds of Docker images: a development image and runtime images. The runtime images include a CPU-only version and a GPU version, together with their corresponding non-AVX variants.
+We provide the latest Docker images on `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_; the latest Paddle image versions can be found under the "tags" tab.
+
+1. Development image: :code:`paddlepaddle/paddle:<version>-dev`
+
+   This image contains the Paddle development tools together with the build and runtime environments. Users can use the development image, instead of configuring a local environment, for development, building, releasing, documentation writing, and so on. Since different Paddle versions may require different dependencies and tools, the version has to be taken into account when setting up a development environment by hand.
+   The development image contains the following tools:
+
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+
+   Many developers work on a remote server with GPUs installed; they can SSH into the server and run :code:`docker exec` to enter the development image and start working. Alternatively, an SSHD service can be started inside the development image so that developers can log into the image directly:
+
+   To run the development image as an interactive container:
+
+   .. code-block:: bash
+
+      docker run -it --rm paddledev/paddle:<version>-dev /bin/bash
+
+   or, run the container as a daemon:
+
+   .. code-block:: bash
+
+      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
+
+   then SSH into the container with the password :code:`root`:
+
+   .. code-block:: bash
+
+      ssh -p 2202 root@localhost
+
+   An advantage of SSH is that we can enter the container from more than one terminal: for example, one terminal runs vi while another runs Python. Another advantage is that we can run the PaddlePaddle container on a remote server and SSH into it from a laptop.
+
+2. Runtime images: there are four images, distinguished by CPU, GPU, and non-AVX:
+
+   - GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
+   - GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
+   - CPU/AVX: :code:`paddlepaddle/paddle:<version>`
+   - CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
+
+   Both the CPU-only images and the GPU images use the AVX instruction set, but old computers produced before 2008 do not support AVX. The following command checks whether a Linux computer supports AVX:
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+   If the output is No, choose one of the no-AVX images.
+
+   The methods above also work with the GPU images; just remember to install the latest GPU driver on the physical machine beforehand.
+   To make sure the GPU driver works inside the images, we recommend running them with `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_:
+
+   .. code-block:: bash
+
+      nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
+
+   Note: if nvidia-docker does not work for you, you may try the older method below, although we do not recommend it:
+
+   .. code-block:: bash
+
+      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
+
+3. Ship your AI program with a runtime image
+
+   Suppose you have finished an AI training Python program :code:`a.py`, developed on your development machine with the development image. You can then test-run it on the development machine with:
+
+   .. code-block:: bash
+
+      docker run -it -v $PWD:/work paddle /work/a.py
+
+   Here `a.py` is assumed to find all of its dependencies inside the Paddle runtime container. If more dependencies are needed, or if you want to release an image of your application, you can write a `Dockerfile` that starts with `FROM paddledev/paddle:<version>` to create and publish an image of your own AI program.

 Running the PaddlePaddle Book
 ---------------------

 Jupyter Notebook is an open-source web application for creating and sharing interactive documents containing code, formulas, charts, and text, which readers can browse in a web page.

 The PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers.
 If you want a deeper understanding of deep learning, the PaddlePaddle Book is definitely your best choice.

-Once inside the container, simply run:
+We provide a Docker image that runs the PaddlePaddle Book directly:

 .. code-block:: bash

-    jupyter notebook
+    docker run -p 8888:8888 paddlepaddle/book

 Then open the following address in a browser:

 .. code-block:: text

     http://localhost:8888/

 That's all. Enjoy your journey!
-
-Non-AVX images
----------
-
-Both the CPU-only image and the GPU image use the AVX instruction set, but old computers produced before 2008 do not support AVX. The following command checks whether a Linux computer supports AVX:
-
-.. code-block:: bash
-
-    if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-If the output is No, we have to build a non-AVX image manually:
-
-.. code-block:: bash
-
-    cd ~
-    git clone https://github.com/PaddlePaddle/Paddle.git
-    cd Paddle
-    docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-    docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .

 Developing PaddlePaddle with Docker containers
 ------------------------------

-Developers can develop PaddlePaddle in Docker. This way, developers can work in a consistent manner on different platforms -- Linux, Mac OS X, and Windows.
+Developers can develop PaddlePaddle in the Docker development image. This way, developers can work in a consistent manner on different platforms -- Linux, Mac OS X, and Windows.

-1. Build the development environment as a Docker image
+1. Build the development image

    .. code-block:: bash

       git clone --recursive https://github.com/PaddlePaddle/Paddle
       cd Paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+      docker build -t paddle:dev .

-   Please note that by default :code:`docker build` does not import the source code into the image and build it. If we want to do that, we need to set a build argument:
+   Please note that by default :code:`docker build` does not import the source code into the image and build it. If we want to do that, we need to build the development image first and then run:

    .. code-block:: bash

-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+      docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev

 2. Run the development environment

    Once :code:`paddle:dev` is built, we can develop inside a Docker container, loading the source code into the development environment by mounting a local directory:

    .. code-block:: bash

-      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
+      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev sshd

    The command above starts a Docker container with the PaddlePaddle development environment; the source code is mounted at :code:`/paddle`.

-   Please note that the default entry point of :code:`paddle:dev` is :code:`sshd`. The :code:`docker run` command above actually starts an SSHD server listening on port 2202, so we can SSH into our development container:
+   The :code:`docker run` command above actually starts an SSHD server listening on port 2202, so we can SSH into our development container:

    .. code-block:: bash

       ssh root@localhost -p 2202

@@ -124,13 +144,13 @@ The PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers
 3. Build and install PaddlePaddle inside the Docker development environment

    Inside the container, the script :code:`paddle/scripts/docker/build.sh` builds, installs, and tests PaddlePaddle:

    .. code-block:: bash

       /paddle/paddle/scripts/docker/build.sh

    The command above builds PaddlePaddle in :code:`/paddle/build`. The unit tests can be run with:

    .. code-block:: bash

       cd /paddle/build

@@ -140,14 +160,14 @@ The PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers
 Documentation
 ----

-Paddle's Docker image contains an HTML version of the C++ source code, generated with the `woboq code browser <https://github.com/woboq/woboq_codebrowser>`_, which makes it easy to browse the C++ sources.
+Paddle's Docker development image contains an HTML version of the C++ source code, generated with the `woboq code browser <https://github.com/woboq/woboq_codebrowser>`_, which makes it easy to browse the C++ sources.

 As long as we give the PaddlePaddle container a name when starting it in Docker, we can run another Nginx Docker image to serve the HTML code:

 .. code-block:: bash

-    docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
+    docker run -d --name paddle-cpu-doc paddle:<version>-dev
     docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx

 Then we can open a browser at http://localhost:8088/paddle/ to browse the code.
@@ -8,173 +8,255 @@ Please be aware that you will need to change `Dockers settings
 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
 of your hardware resource on Mac OS X and Windows.

+Working With Docker
+-------------------
+
+Docker is simple as long as we understand a few basic concepts:
+
+- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, rather than installing all this software ourselves. We can type
+
+  .. code-block:: bash
+
+     docker images
+
+  to list all images in the system. We can also run
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:0.10.0rc2
+
+  to download a Docker image, paddlepaddle/paddle in this example,
+  from Dockerhub.com.
+
+- *container*: considering a Docker image a program, a container is a
+  "process" that runs the image. Indeed, a container is exactly an
+  operating system process, but with a virtualized filesystem, network
+  port space, and other virtualized environment. We can type
+
+  .. code-block:: bash
+
+     docker run paddlepaddle/paddle:0.10.0rc2
+
+  to start a container to run a Docker image, paddlepaddle/paddle in this example.
+
+- By default Docker containers have an isolated file system namespace,
+  so we cannot see the files in the host file system. By using a *volume*,
+  files mounted from the host become visible inside the Docker container.
+  The following command mounts the current directory into /data inside a
+  Docker container, and runs a container from the debian image with the
+  command :code:`ls /data`.
+
+  .. code-block:: bash
+
+     docker run --rm -v $(pwd):/data debian ls /data
+
 Usage of CPU-only and GPU Images
 ----------------------------------

-For each version of PaddlePaddle, we release 2 Docker images, a
-CPU-only one and a CUDA GPU one. We do so by configuring
-`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ to
-automatically generate the latest docker images `paddledev/paddle:0.10.0rc1-cpu`
-and `paddledev/paddle:0.10.0rc1-gpu`.
-
-To run the CPU-only image as an interactive container:
-
-.. code-block:: bash
-
-    docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
-
-or, we can run it as a daemon container
-
-.. code-block:: bash
-
-    docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
-
-and SSH to this container using password :code:`root`:
-
-.. code-block:: bash
-
-    ssh -p 2202 root@localhost
-
-An advantage of using SSH is that we can connect to PaddlePaddle from
-more than one terminal. For example, one terminal running vi and
-another one running the Python interpreter. Another advantage is that we
-can run the PaddlePaddle container on a remote server and SSH to it
-from a laptop.
-
-The methods above work with the GPU image too -- just please don't forget
-to install the CUDA driver and let Docker know about it:
-
-.. code-block:: bash
-
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+For each version of PaddlePaddle, we release two types of Docker images: a
+development image and production images. The production images include a
+CPU-only version and a CUDA GPU version, as well as their no-AVX versions. We
+put the docker images on `dockerhub.com
+<https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
+latest versions under the "tags" tab at dockerhub.com.
+
+1. Production images; this image might have multiple variants:
+
+   - GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
+   - GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
+   - CPU/AVX: :code:`paddlepaddle/paddle:<version>`
+   - CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
+
+   Please be aware that the CPU-only and the GPU images both use the
+   AVX instruction set, but old computers produced before 2008 do not
+   support AVX. The following command checks if your Linux computer
+   supports AVX:
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+   To run the CPU-only image as an interactive container:
+
+   .. code-block:: bash
+
+      docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash
+
+   The method above works with the GPU image too -- the recommended way is
+   to use `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
+   Please install nvidia-docker first following this `tutorial
+   <https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
+
+   Now you can run a GPU image:
+
+   .. code-block:: bash
+
+      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+
+2. Development image: :code:`paddlepaddle/paddle:<version>-dev`
+
+   This image packs the related development tools and runtime
+   environment. Users and developers can use this image instead of
+   their own local computer to accomplish development, building,
+   releasing, document writing, etc. Since different versions of Paddle
+   may depend on different versions of libraries and tools, you must pay
+   attention to the versions if you want to set up a local environment.
+   The development image contains:
+
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+
+   Many developers use servers with GPUs; they can use ssh to log in to
+   the server and run :code:`docker exec` to enter the docker
+   container and start their work. They can also start a development
+   docker image with an SSHD service, so they can log in to the container
+   and start working.

-PaddlePaddle Book
-------------------
-
-The Jupyter Notebook is an open-source web application that allows
-you to create and share documents that contain live code, equations,
-visualizations and explanatory text in a single browser.
-
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
-dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
-
-Once you are inside the container, simply issue the command:
-
-.. code-block:: bash
-
-    jupyter notebook
-
-Then, you would go back and paste the address into the local browser:
-
-.. code-block:: text
-
-    http://localhost:8888/
-
-That's all. Enjoy your journey!
-
-Non-AVX Images
---------------
-
-Please be aware that the CPU-only and the GPU images both use the AVX
-instruction set, but old computers produced before 2008 do not support
-AVX. The following command checks if your Linux computer supports
-AVX:
-
-.. code-block:: bash
-
-    if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-If it doesn't, we will need to build non-AVX images manually from
-source code:
-
-.. code-block:: bash
-
-    cd ~
-    git clone https://github.com/PaddlePaddle/Paddle.git
-    cd Paddle
-    docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-    docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+Train Model Using Python API
+----------------------------
+
+Our official docker image provides a runtime for PaddlePaddle
+programs. The typical workflow will be as follows:
+
+Create a directory as the workspace:
+
+.. code-block:: bash
+
+    mkdir ~/workspace
+
+Edit a PaddlePaddle python program using your favourite editor:
+
+.. code-block:: bash
+
+    emacs ~/workspace/example.py
+
+Run the program using docker:
+
+.. code-block:: bash
+
+    docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py
+
+Or, if you are using a GPU for training:
+
+.. code-block:: bash
+
+    nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py
+
+The commands above start a docker container by running :code:`python
+/workspace/example.py`. It will stop once :code:`python
+/workspace/example.py` finishes.
+
+Another way is to tell docker to start a :code:`/bin/bash` session and
+run the PaddlePaddle program interactively:
+
+.. code-block:: bash
+
+    docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash
+    # now we are inside the docker container
+    cd /workspace
+    python example.py
+
+Running with a GPU is identical:
+
+.. code-block:: bash
+
+    nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+    # now we are inside the docker container
+    cd /workspace
+    python example.py

-Development Using Docker
-------------------------
-
-Developers can work on PaddlePaddle using Docker. This allows
-developers to work on different platforms -- Linux, Mac OS X, and
-Windows -- in a consistent way.
-
-1. Build the Development Environment as a Docker Image
-
-   .. code-block:: bash
-
-      git clone --recursive https://github.com/PaddlePaddle/Paddle
-      cd Paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
-
-   Note that by default :code:`docker build` wouldn't import the source
-   tree into the image and build it. If we want to do that, we need
-   to set a build arg:
-
-   .. code-block:: bash
-
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
-
-2. Run the Development Environment
-
-   Once we have the image :code:`paddle:dev`, we can use it to develop
-   Paddle by mounting the local source code tree into a container that
-   runs the image:
-
-   .. code-block:: bash
-
-      docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev
-
-   This runs a container of the development environment Docker image
-   with the local source tree mounted to :code:`/paddle` of the
-   container.
-
-   Note that the default entry-point of :code:`paddle:dev` is
-   :code:`sshd`, and above :code:`docker run` commands actually starts
+Develop PaddlePaddle or Train Model Using C++ API
+---------------------------------------------------
+
+We will be using the PaddlePaddle development image, since it contains all
+compiling tools and dependencies.
+
+Let's clone the PaddlePaddle repo first:
+
+.. code-block:: bash
+
+    git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
+
+Mount both the workspace folder and the paddle code folder into the docker
+container, so we can access them inside the docker container. There are
+two ways of using the PaddlePaddle development docker image:
+
+- run an interactive bash directly
+
+  .. code-block:: bash
+
+     # use nvidia-docker instead of docker if you need to use GPU
+     docker run -it -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /bin/bash
+     # now we are inside the docker container
+
+- or, we can run it as a daemon container
+
+  .. code-block:: bash
+
+     # use nvidia-docker instead of docker if you need to use GPU
+     docker run -d -p 2202:22 -p 8888:8888 -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /usr/sbin/sshd -D
+
+  and SSH to this container using password :code:`root`:
an SSHD server listening on port 2202. This allows us to log into
this container with:
.. code-block:: bash .. code-block:: bash
ssh root@localhost -p 2202 ssh -p 2202 root@localhost
Usually, I run above commands on my Mac. I can also run them on a An advantage is that we can run the PaddlePaddle container on a
GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it: remote server and SSH to it from a laptop.
.. code-block:: bash When developing PaddlePaddle, you can edit PaddlePaddle source code
from outside of docker container using your favoriate editor. To
compile PaddlePaddle, run inside container:
my-mac$ ssh root@xxx.yyy.zzz.www -p 2202 .. code-block:: bash
3. Build and Install Using the Development Environment WITH_GPU=OFF WITH_AVX=ON WITH_TEST=ON bash /paddle/paddle/scripts/docker/build.sh
Once I am in the container, I can use This builds everything about Paddle in :code:`/paddle/build`. And we
:code:`paddle/scripts/docker/build.sh` to build, install, and test can run unit tests there:
Paddle:
.. code-block:: bash .. code-block:: bash
/paddle/paddle/scripts/docker/build.sh cd /paddle/build
ctest
This builds everything about Paddle in :code:`/paddle/build`. And When training model using C++ API, we can edit paddle program in
we can run unit tests there: ~/workspace outside of docker. And build from /workspace inside of
docker.
.. code-block:: bash PaddlePaddle Book
------------------
The Jupyter Notebook is an open-source web application that allows
you to create and share documents that contain live code, equations,
visualizations and explanatory text in a single browser.
PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
We already exposed port 8888 for this book. If you want to
dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
We provide a packaged book image, simply issue the command:
.. code-block:: bash
cd /paddle/build docker run -p 8888:8888 paddlepaddle/book
ctest
Then, you would back and paste the address into the local browser:
.. code-block:: text
http://localhost:8888/
That's all. Enjoy your journey!
Documentation Documentation
...@@ -191,7 +273,7 @@ container: ...@@ -191,7 +273,7 @@ container:
.. code-block:: bash .. code-block:: bash
docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu docker run -d --name paddle-cpu-doc paddle:<version>
docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
......
...@@ -46,7 +46,6 @@ PaddlePaddle provides deb packages for Ubuntu 14.04. ...@@ -46,7 +46,6 @@ PaddlePaddle provides deb packages for Ubuntu 14.04.
with_double: OFF with_double: OFF
with_python: ON with_python: ON
with_rdma: OFF with_rdma: OFF
with_metric_learning:
with_timer: OFF with_timer: OFF
with_predict_sdk: with_predict_sdk:
......
...@@ -55,6 +55,7 @@ extensions = [ ...@@ -55,6 +55,7 @@ extensions = [
'sphinx.ext.napoleon', 'sphinx.ext.napoleon',
'sphinx.ext.graphviz' 'sphinx.ext.graphviz'
] ]
mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"
table_styling_embed_css = True table_styling_embed_css = True
autodoc_member_order = 'bysource' autodoc_member_order = 'bysource'
......
...@@ -8,16 +8,13 @@ add_subdirectory(gserver) ...@@ -8,16 +8,13 @@ add_subdirectory(gserver)
add_subdirectory(pserver) add_subdirectory(pserver)
add_subdirectory(trainer) add_subdirectory(trainer)
add_subdirectory(scripts) add_subdirectory(scripts)
if(WITH_C_API) if(WITH_C_API)
add_subdirectory(capi) add_subdirectory(capi)
endif() endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
if(WITH_PREDICT_SDK)
add_subdirectory(predict)
endif()
if(WITH_SWIG_PY) if(WITH_SWIG_PY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
add_subdirectory(api) add_subdirectory(api)
endif() endif()
...@@ -76,8 +76,6 @@ SWIG_LINK_LIBRARIES(swig_paddle ...@@ -76,8 +76,6 @@ SWIG_LINK_LIBRARIES(swig_paddle
${CMAKE_DL_LIBS} ${CMAKE_DL_LIBS}
${EXTERNAL_LIBS} ${EXTERNAL_LIBS}
${CMAKE_THREAD_LIBS_INIT} ${CMAKE_THREAD_LIBS_INIT}
${RDMA_LD_FLAGS}
${RDMA_LIBS}
${START_END} ${START_END}
) )
......
...@@ -17,7 +17,11 @@ limitations under the License. */ ...@@ -17,7 +17,11 @@ limitations under the License. */
#include <stdio.h> #include <stdio.h>
#include "hl_base.h" #include "hl_base.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_neon_matrix_kernel.cuh"
#else
#include "hl_sse_matrix_kernel.cuh" #include "hl_sse_matrix_kernel.cuh"
#endif
/** /**
* @brief cpu element wise unary operator. * @brief cpu element wise unary operator.
......
...@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff; ...@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
typedef BaseOp SSEFirst; typedef BaseOp SSEFirst;
typedef BaseOp SSESecond; typedef BaseOp SSESecond;
typedef BaseOp SSEClassificationError; typedef BaseOp SSEClassificationError;
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_matrix_base_neon.cuh"
#else #else
#include "hl_matrix_base_sse.cuh" #include "hl_matrix_base_sse.cuh"
#endif #endif
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_BASE_NEON_CUH_
#define HL_MATRIX_BASE_NEON_CUH_
namespace aggregate {
class SSESum {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vaddq_f32(a, b);
}
};
class SSEMax {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vmaxq_f32(a, b);
}
};
class SSEMin {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vminq_f32(a, b);
}
};
} // namespace aggregate
namespace base {
namespace unary {
class SSEIdentity {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a) const {
return a;
}
};
} // namespace unary
namespace binary {
class SSEAdd {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vaddq_f32(a, b);
}
};
class SSEAdd2 {
public:
static const bool sse = true;
const real p1;
const real p2;
float32x4_t mp1;
float32x4_t mp2;
public:
SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
mp1 = vdupq_n_f32(p1);
mp2 = vdupq_n_f32(p2);
}
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp1, tmp2;
tmp1 = vmulq_f32(mp1, a);
tmp2 = vmulq_f32(mp2, b);
return vaddq_f32(tmp1, tmp2);
}
};
class SSESub {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vsubq_f32(a, b);
}
};
class SSEMul {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vmulq_f32(a, b);
}
};
class SSEDiv {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
// vrecpeq_f32 only gives a ~8-bit estimate of 1/b; one Newton-Raphson
// step via vrecpsq_f32 refines the estimate before multiplying.
float32x4_t tmp;
tmp = vrecpeq_f32(b);
tmp = vmulq_f32(vrecpsq_f32(b, tmp), tmp);
return vmulq_f32(a, tmp);
}
};
class SSESquaredDiff {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp;
tmp = vsubq_f32(a, b);
return vmulq_f32(tmp, tmp);
}
};
class SSEFirst {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return a;
}
};
class SSESecond {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return b;
}
};
class SSEClassificationError {
public:
static const bool sse = true;
const real p;
float32x4_t mp;
uint32x4_t result;
public:
explicit SSEClassificationError(const real s) : p(s) {
mp = vdupq_n_f32(p);
result = vdupq_n_u32(1);
}
// TODO: verify against the SSE implementation. Returns 1 for lanes
// where exactly one of a, b is greater than the threshold p.
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
uint32x4_t tmp1 = vcgtq_f32(a, mp);
uint32x4_t tmp2 = vcgtq_f32(b, mp);
uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
return vcvtq_f32_u32(vandq_u32(tmp3, result));
}
};
} // namespace binary
} // namespace base
#endif /* HL_MATRIX_BASE_NEON_CUH_ */
...@@ -17,13 +17,20 @@ limitations under the License. */ ...@@ -17,13 +17,20 @@ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
#ifdef __CUDA_ARCH__ #if defined(__CUDA_ARCH__)
#include <vector_types.h> #include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType; typedef float4 vecType;
#else #else
typedef double2 vecType; typedef double2 vecType;
#endif #endif
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#include <arm_neon.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef float32x4_t vecType;
#else
#error NEON instructions do not support double precision
#endif
#else #else
#include <mmintrin.h> #include <mmintrin.h>
#include <xmmintrin.h> #include <xmmintrin.h>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_NEON_MATRIX_KERNEL_CUH_
#define HL_NEON_MATRIX_KERNEL_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET vdupq_n_f32
// Check 16-byte (one float32x4_t) alignment before the vector casts below.
inline bool hl_check_align(size_t size) {
return !(size & (VECTOR_SIZE - 1));
}
inline bool hl_check_align(void *ptr) {
return hl_check_align(reinterpret_cast<size_t>(ptr));
}
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
// Horizontal reduction. First combine adjacent lanes:
// [a,b,c,d] op [b,a,d,c] -> [a.b, a.b, c.d, c.d]
float32x4_t rev = vrev64q_f32(mm);
float32x4_t tmp1 = agg.vecOp(mm, rev);
// then combine the two 64-bit halves of the partial result
float32x2_t lo = vget_low_f32(tmp1);
float32x2_t hi = vget_high_f32(tmp1);
float32x4_t tmp2 = vcombine_f32(hi, lo);
float32x4_t ret = agg.vecOp(tmp1, tmp2);
return vgetq_lane_f32(ret, 0);
}
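As a sanity check for the lane-shuffle reduction above, the following
minimal sketch (a hypothetical standalone test, to be compiled for an
ARM target with NEON) instantiates the same shuffle with plain
addition:

.. code-block:: cpp

   // Hypothetical standalone check of the horizontal-sum shuffle above.
   // Compile for an ARM target, e.g.: g++ -O2 -mfpu=neon check.cpp
   #include <arm_neon.h>
   #include <cstdio>

   static float neonSum(float32x4_t mm) {
     float32x4_t rev = vrev64q_f32(mm);       // [b,a,d,c]
     float32x4_t tmp1 = vaddq_f32(mm, rev);   // [a+b, a+b, c+d, c+d]
     float32x4_t tmp2 =
         vcombine_f32(vget_high_f32(tmp1), vget_low_f32(tmp1));
     return vgetq_lane_f32(vaddq_f32(tmp1, tmp2), 0);  // a+b+c+d
   }

   int main() {
     const float v[4] = {1.f, 2.f, 3.f, 4.f};
     printf("%f\n", neonSum(vld1q_f32(v)));   // prints 10.000000
     return 0;
   }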
template <class Agg, class Op, class Saver>
void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst, int ld,
real *A, int lda) {
for (int i = 0; i < dimM; i++, A += lda) {
vecType mm = VECTOR_SET(agg.init());
vecType *a = (vecType*)(A);
for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
mm = agg.vecOp(mm, op.vecOp(*a));
}
int rem = dimN % VECTOR_LEN;
if (rem) {
real tmp = hl_agg_op(agg, mm);
real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
for (int j = 0; j < rem; j++) {
tmp = agg(tmp, op(a[j]));
}
dst[i*ld] = sv(dst[i*ld], tmp);
} else {
dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
}
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst, int ld,
real *A, int lda,
real *B, int ldb) {
for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
vecType mm = VECTOR_SET(agg.init());
vecType *a = (vecType*)(A);
vecType *b = (vecType*)(B);
for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
mm = agg.vecOp(mm, op.vecOp(*a, *b));
}
int rem = dimN % VECTOR_LEN;
if (rem) {
real tmp = hl_agg_op(agg, mm);
real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
for (int j = 0; j < rem; j++) {
tmp = agg(tmp, op(a[j], b[j]));
}
dst[i*ld] = sv(dst[i*ld], tmp);
} else {
dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
}
}
}
template <class Agg, class Op, class Saver>
void hl_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
for (int j = 0; j < dimN; j++) {
real tmp = agg.init();
for (int i = 0; i < dimM; i++) {
tmp = agg(tmp, op(A[i * lda + j]));
}
dst[j] = sv(dst[j], tmp);
}
}
template <class Agg, class Op, class Saver>
void hl_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
for (int j = 0; j < dimN; j++) {
real tmp = agg.init();
for (int i = 0; i < dimM; i++) {
tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
}
dst[j] = sv(dst[j], tmp);
}
}
/*
* MaxRow is greater than or equal to dimN, and dimN is a multiple of
* VECTOR_LEN, so rem <= MaxRow / VECTOR_LEN
*/
template <int MaxRow, class Agg, class Op, class Saver>
void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
vecType mm[MaxRow / VECTOR_LEN];
for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
int rem = dimN % VECTOR_LEN;
if (rem) {
A += (dimN / VECTOR_LEN) * VECTOR_LEN;
dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
}
}
/*
* dimN is a multiple of VECTOR_LEN
* dimN is greater than Step
*/
template <int Step, class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
vecType mm[Step / VECTOR_LEN];
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
}
int remRow = dimN % Step;
if (remRow) {
hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
if (dimN <= 16) {
hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
} else if (dimN <= 32) {
hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
} else if (dimN <= 1024 || dimM <= 512) {
hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
} else {
hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
}
}
template <int MaxRow, class Agg, class Op, class Saver>
void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
vecType mm[MaxRow / VECTOR_LEN];
for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
vecType *b = (vecType*)(B + i * ldb);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
int rem = dimN % VECTOR_LEN;
if (rem) {
A += (dimN / VECTOR_LEN) * VECTOR_LEN;
B += (dimN / VECTOR_LEN) * VECTOR_LEN;
dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
}
}
template <int Step, class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
vecType mm[Step / VECTOR_LEN];
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
vecType *b = (vecType*)(B + i * ldb);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
}
int remRow = dimN % Step;
if (remRow) {
hl_sse_column_op_with_rem<Step>(
agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
if (dimN <= 16) {
hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else if (dimN <= 32) {
hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else if (dimN <= 1024 || dimM <= 512) {
hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else {
hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
}
}
#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */
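All of the kernels in this file share one two-phase pattern: aggregate
whole 4-lane vectors first, then mop up the scalar tail of each row or
column. A scalar sketch of the row case (rowSum is a hypothetical
helper, for illustration only):

.. code-block:: cpp

   // Scalar illustration of the vector-plus-remainder pattern used by
   // hl_sse_matrix_row_op: process dimN in chunks of VECTOR_LEN (4),
   // then aggregate the remaining tail elements one by one.
   void rowSum(const float* A, int dimM, int dimN, int lda, float* dst) {
     for (int i = 0; i < dimM; ++i, A += lda) {
       float acc[4] = {0.f, 0.f, 0.f, 0.f};  // stands in for one float32x4_t
       int j = 0;
       for (; j + 4 <= dimN; j += 4) {       // full vectors
         for (int k = 0; k < 4; ++k) acc[k] += A[j + k];
       }
       float tmp = acc[0] + acc[1] + acc[2] + acc[3];  // the hl_agg_op step
       for (; j < dimN; ++j) tmp += A[j];              // scalar remainder
       dst[i] = tmp;
     }
   }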
...@@ -159,4 +159,10 @@ extern void hl_sequence_avg_forward(real* dst, ...@@ -159,4 +159,10 @@ extern void hl_sequence_avg_forward(real* dst,
int width, int width,
const int mode); const int mode);
extern void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode);
#endif /* HL_SEQUENCE_H_ */ #endif /* HL_SEQUENCE_H_ */
...@@ -57,4 +57,10 @@ inline void hl_sequence_avg_forward(real* dst, ...@@ -57,4 +57,10 @@ inline void hl_sequence_avg_forward(real* dst,
int width, int width,
const int mode) {} const int mode) {}
inline void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {}
#endif // HL_SEQUENCE_STUB_H_ #endif // HL_SEQUENCE_STUB_H_
...@@ -325,12 +325,12 @@ __global__ void KeSequenceAvgForward(real* dst, ...@@ -325,12 +325,12 @@ __global__ void KeSequenceAvgForward(real* dst,
int seqLength = end - start; int seqLength = end - start;
if (seqLength == 0) return; if (seqLength == 0) return;
real sum = 0.0; real sum = 0.0;
for (int i = 0; i < seqLength; i++) { for (int i = start; i < end; i++) {
sum += src[(start + i) * width + col]; sum += src[i * width + col];
} }
sum = mode == 1 ? sum : sum = mode == 1 ? sum :
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
dst[row * width + col] = sum; dst[gid] = sum;
} }
} }
...@@ -354,3 +354,48 @@ void hl_sequence_avg_forward(real* dst, ...@@ -354,3 +354,48 @@ void hl_sequence_avg_forward(real* dst,
(dst, src, starts, height, width, mode); (dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed"); CHECK_SYNC("hl_sequence_avg_forward failed");
} }
__global__ void KeSequenceAvgBackward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int row = gid / width;
int col = gid % width;
if (gid < height * width) {
int start = starts[row];
int end = starts[row + 1];
int seqLength = end - start;
if (seqLength == 0) return;
real grad = src[gid];
grad = mode == 1 ? grad :
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) {
dst[i * width + col] += grad;
}
}
}
void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {
CHECK_NOTNULL(dst);
CHECK_NOTNULL(src);
CHECK_NOTNULL(starts);
int block = 512;
int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed");
}
...@@ -16,66 +16,6 @@ limitations under the License. */ ...@@ -16,66 +16,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
template <>
size_t FuncConfig::get<size_t>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.s;
}
template <>
real FuncConfig::get<real>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.r;
}
template <>
int FuncConfig::get<int>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.i;
}
template <>
bool FuncConfig::get<bool>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.b;
}
template <>
FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].s = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].r = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].i = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].b = v;
return *this;
}
void BufferArgs::addArg(const Matrix& arg, void BufferArgs::addArg(const Matrix& arg,
const TensorShape& shape, const TensorShape& shape,
ArgType argType) { ArgType argType) {
......
...@@ -18,32 +18,49 @@ limitations under the License. */ ...@@ -18,32 +18,49 @@ limitations under the License. */
#include <vector> #include <vector>
#include "BufferArg.h" #include "BufferArg.h"
#include "paddle/math/Matrix.h" #include "paddle/math/Matrix.h"
#include "paddle/utils/Any.h"
#include "paddle/utils/ClassRegistrar.h" #include "paddle/utils/ClassRegistrar.h"
#include "paddle/utils/Error.h"
namespace paddle { namespace paddle {
/** /**
* Function Configuration. * Function Configuration.
* The argument type of Function::init. * The argument type of Function::init.
* A follow-up change will consider moving this data structure into Proto.
*/ */
class FuncConfig { class FuncConfig {
public: public:
union value {
size_t s;
real r;
int i;
bool b;
};
template <typename T> template <typename T>
T get(const std::string& key) const; T get(const std::string& key, Error* err = nullptr) const {
try {
return any_cast<T>(valueMap_.at(key));
} catch (std::exception& e) { // could be cast or out of range exception.
if (err) {
*err = Error(e.what());
} else {
LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
}
return T();
}
}
template <typename T> template <typename T>
FuncConfig& set(const std::string& key, T v); FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
auto it = valueMap_.find(key);
if (it != valueMap_.end()) { // already contains key.
if (err) {
*err = Error("Key %s is already set in FuncConfig", key.c_str());
} else {
LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
}
return *this;
}
valueMap_[key] = any(v);
return *this;
}
protected: protected:
std::map<std::string, value> valueMap_; mutable std::unordered_map<std::string, any> valueMap_;
}; };
/** /**
......
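To illustrate the new :code:`any`-based interface, here is a hedged
usage sketch; :code:`funcConfigExample` is hypothetical, the header
path is assumed, and :code:`Error::isOK()` is assumed from
paddle/utils/Error.h:

.. code-block:: cpp

   #include <cstdint>
   #include <vector>
   #include "paddle/function/Function.h"  // header path assumed

   void funcConfigExample() {
     paddle::FuncConfig config;
     // set() now stores any copyable type; setting a key twice reports
     // an Error (or LOG(FATAL) when no Error pointer is passed).
     config.set<std::vector<uint32_t>>("channel", {2, 3})
         .set<std::vector<uint32_t>>("height", {1, 2})
         .set<std::vector<uint32_t>>("width", {3, 2});

     // get() reports a missing key or a failed any_cast through the
     // optional Error out-parameter instead of terminating.
     paddle::Error err;
     auto channel = config.get<std::vector<uint32_t>>("channel", &err);
     if (!err.isOK()) {  // isOK() assumed from paddle/utils/Error.h
       // handle the lookup/cast failure here
     }
     (void)channel;
   }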
...@@ -25,9 +25,9 @@ void Pad<DEVICE_TYPE_CPU>(real* outputs, ...@@ -25,9 +25,9 @@ void Pad<DEVICE_TYPE_CPU>(real* outputs,
const int inH, const int inH,
const int inW, const int inW,
const PadConf& pad) { const PadConf& pad) {
int cstart = pad.channelStart, cend = pad.channelEnd; int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.heightStart, hend = pad.heightEnd; int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.widthStart, wend = pad.widthEnd; int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend; int outC = inC + cstart + cend;
int outH = inH + hstart + hend; int outH = inH + hstart + hend;
int outW = inW + wstart + wend; int outW = inW + wstart + wend;
...@@ -51,9 +51,9 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad, ...@@ -51,9 +51,9 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
const int inH, const int inH,
const int inW, const int inW,
const PadConf& pad) { const PadConf& pad) {
int cstart = pad.channelStart, cend = pad.channelEnd; int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.heightStart, hend = pad.heightEnd; int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.widthStart, wend = pad.widthEnd; int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend; int outC = inC + cstart + cend;
int outH = inH + hstart + hend; int outH = inH + hstart + hend;
int outW = inW + wstart + wend; int outW = inW + wstart + wend;
...@@ -71,6 +71,12 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad, ...@@ -71,6 +71,12 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
} }
} }
static inline PadConf castToPadConf(const FuncConfig& conf) {
return {conf.get<std::vector<uint32_t>>("channel"),
conf.get<std::vector<uint32_t>>("height"),
conf.get<std::vector<uint32_t>>("width")};
}
/** /**
* \brief Padding zeros to input according to the specify dimension. * \brief Padding zeros to input according to the specify dimension.
* The struct pad_ contains the padding size in each dimension. * The struct pad_ contains the padding size in each dimension.
...@@ -127,14 +133,7 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad, ...@@ -127,14 +133,7 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
template <DeviceType Device> template <DeviceType Device>
class PadFunc : public FunctionBase { class PadFunc : public FunctionBase {
public: public:
void init(const FuncConfig& config) override { void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
pad_.channelStart = config.get<int>("cstart");
pad_.channelEnd = config.get<int>("cend");
pad_.heightStart = config.get<int>("hstart");
pad_.heightEnd = config.get<int>("hend");
pad_.widthStart = config.get<int>("wstart");
pad_.widthEnd = config.get<int>("wend");
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size()); CHECK_EQ(1UL, inputs.size());
...@@ -175,14 +174,7 @@ private: ...@@ -175,14 +174,7 @@ private:
template <DeviceType Device> template <DeviceType Device>
class PadGradFunc : public FunctionBase { class PadGradFunc : public FunctionBase {
public: public:
void init(const FuncConfig& config) override { void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
pad_.channelStart = config.get<int>("cstart");
pad_.channelEnd = config.get<int>("cend");
pad_.heightStart = config.get<int>("hstart");
pad_.heightEnd = config.get<int>("hend");
pad_.widthStart = config.get<int>("wstart");
pad_.widthEnd = config.get<int>("wend");
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size()); CHECK_EQ(1UL, inputs.size());
......
...@@ -19,18 +19,12 @@ limitations under the License. */ ...@@ -19,18 +19,12 @@ limitations under the License. */
namespace paddle { namespace paddle {
struct PadConf { struct PadConf {
/// how many values to add before the data along channel dimension. /// how many values to add before/after the data along channel dimension.
int channelStart; std::vector<uint32_t> channel;
/// how many values to add after the data along channel dimension. /// how many values to add before/after the data along height dimension.
int channelEnd; std::vector<uint32_t> height;
/// how many values to add before the data along height dimension. /// how many values to add before/after the data along width dimension.
int heightStart; std::vector<uint32_t> width;
/// how many values to add after the data along height dimension.
int heightEnd;
/// how many values to add before the data along width dimension.
int widthStart;
/// how many values to add after the data along width dimension.
int widthEnd;
}; };
/** /**
......
...@@ -44,9 +44,9 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs, ...@@ -44,9 +44,9 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
size_t nth = num * inC * inH * inW; size_t nth = num * inC * inH * inW;
int blockSize = 1024; int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024; int gridSize = (nth + 1024 - 1) / 1024;
int cstart = pad.channelStart, cend = pad.channelEnd; int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.heightStart, hend = pad.heightEnd; int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.widthStart, wend = pad.widthEnd; int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend; int outC = inC + cstart + cend;
int outH = inH + hstart + hend; int outH = inH + hstart + hend;
int outW = inW + wstart + wend; int outW = inW + wstart + wend;
...@@ -83,9 +83,9 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad, ...@@ -83,9 +83,9 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
int nth = num * inC * inH * inW; int nth = num * inC * inH * inW;
int blockSize = 1024; int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024; int gridSize = (nth + 1024 - 1) / 1024;
int cstart = pad.channelStart, cend = pad.channelEnd; int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.heightStart, hend = pad.heightEnd; int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.widthStart, wend = pad.widthEnd; int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend; int outC = inC + cstart + cend;
int outH = inH + hstart + hend; int outH = inH + hstart + hend;
int outW = inW + wstart + wend; int outW = inW + wstart + wend;
......
...@@ -24,48 +24,22 @@ TEST(Pad, real) { ...@@ -24,48 +24,22 @@ TEST(Pad, real) {
for (size_t imgSizeW : {5, 32, 96}) { for (size_t imgSizeW : {5, 32, 96}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
for (bool test_grad : {false, true}) {
FunctionCompare compare("Pad", FunctionCompare compare(
FuncConfig() test_grad ? "PadGrad" : "Pad",
.set("cstart", 2) FuncConfig()
.set("cend", 3) .set<std::vector<uint32_t>>("channel", {2, 3})
.set("hstart", 1) .set<std::vector<uint32_t>>("height", {1, 2})
.set("hend", 2) .set<std::vector<uint32_t>>("width", {3, 2}));
.set("wstart", 3) TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
.set("wend", 2)); TensorShape outDims{
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
TensorShape outDims{ compare.addInputs(
numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5}; BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims)); compare.addOutputs(BufferArg(
compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO)); VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
compare.run(); compare.run();
} }
}
}
}
}
TEST(PadGrad, real) {
for (size_t numSamples : {5, 32}) {
for (size_t channels : {1, 5, 32}) {
for (size_t imgSizeH : {5, 33, 100}) {
for (size_t imgSizeW : {5, 32, 96}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
FunctionCompare compare("PadGrad",
FuncConfig()
.set("cstart", 2)
.set("cend", 3)
.set("hstart", 1)
.set("hend", 2)
.set("wstart", 3)
.set("wend", 2));
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{
numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims));
compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO));
compare.run();
} }
} }
} }
......
...@@ -42,7 +42,8 @@ void AgentLayer::forward(PassType passType) { ...@@ -42,7 +42,8 @@ void AgentLayer::forward(PassType passType) {
// get Arguments from real layers // get Arguments from real layers
if (numSamples_ > 0 && numSamples_ < realHeight) { if (numSamples_ > 0 && numSamples_ < realHeight) {
if (realOutput.ids) { if (realOutput.ids) {
output_.ids->subVecFrom(*realOutput.ids, 0, numSamples_); output_.ids =
IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
} else { } else {
output_.subArgFrom( output_.subArgFrom(
realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_); realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
......
...@@ -26,8 +26,6 @@ bool AverageLayer::init(const LayerMap& layerMap, ...@@ -26,8 +26,6 @@ bool AverageLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
SequencePoolLayer::init(layerMap, parameterMap); SequencePoolLayer::init(layerMap, parameterMap);
dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
// average strategy // average strategy
if (config_.average_strategy() == "average") { if (config_.average_strategy() == "average") {
mode_ = kAverage; mode_ = kAverage;
...@@ -60,43 +58,9 @@ void AverageLayer::forward(PassType passType) { ...@@ -60,43 +58,9 @@ void AverageLayer::forward(PassType passType) {
void AverageLayer::backward(const UpdateCallback& callback) { void AverageLayer::backward(const UpdateCallback& callback) {
SequencePoolLayer::backward(callback); SequencePoolLayer::backward(callback);
const int* starts = startPositions_->getData(false); if (getInputGrad(0)) {
MatrixPtr grad = getInputGrad(0); getInputGrad(0)->sequenceAvgBackward(
*getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
if (grad) {
size_t dim = getSize();
real* gradientData = getInputGrad(0)->getData();
real* gradient = getOutputGrad()->getData();
size_t numSequences = startPositions_->getSize() - 1;
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
// TODO(Dangqingqing) optimization for GPU
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
if (0 == sequenceLength) {
// empty sequence
continue;
}
dataMtx_->setData(
gradientData + starts[sequenceId] * dim, sequenceLength, dim);
outMtx_->setData(gradient + sequenceId * dim);
switch (mode_) {
case kAverage: {
// plain average
dataMtx_->addBias(*outMtx_, 1.0f / sequenceLength);
break;
}
case kSum: {
// sum instead of average
dataMtx_->addBias(*outMtx_, 1.0f);
break;
}
case kAverageSquareRootN: {
// divide by square root of sequenceLength
dataMtx_->addBias(*outMtx_, 1.0f / sqrt(sequenceLength));
break;
}
default: { LOG(FATAL) << "should not reach here"; }
}
}
} }
} }
......
...@@ -45,8 +45,6 @@ public: ...@@ -45,8 +45,6 @@ public:
void backward(const UpdateCallback& callback = nullptr) override; void backward(const UpdateCallback& callback = nullptr) override;
protected: protected:
MatrixPtr outMtx_;
MatrixPtr dataMtx_;
int mode_; int mode_;
}; };
} // namespace paddle } // namespace paddle
...@@ -107,6 +107,10 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, ...@@ -107,6 +107,10 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
int channel = isDeconv_ ? numFilters_ : channels_[inIdx]; int channel = isDeconv_ ? numFilters_ : channels_[inIdx];
resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]); resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
CHECK_EQ(image->getWidth(),
static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
real *imgData = image->getData() + startIdx * image->getWidth(); real *imgData = image->getData() + startIdx * image->getWidth();
MatrixPtr imageTmp = MatrixPtr imageTmp =
Matrix::create(imgData, Matrix::create(imgData,
......
...@@ -36,7 +36,7 @@ namespace paddle { ...@@ -36,7 +36,7 @@ namespace paddle {
* | |- 5 * | |- 5
* | * |
* |-*- 0 * |-*- 0
* |- 1 * |- 1
* @endcode * @endcode
* *
* where * indicates an internal node, and each leaf node represents a class. * where * indicates an internal node, and each leaf node represents a class.
......
...@@ -36,12 +36,9 @@ bool PadLayer::init(const LayerMap& layerMap, ...@@ -36,12 +36,9 @@ bool PadLayer::init(const LayerMap& layerMap,
CHECK_EQ(2, pad_conf.pad_c_size()); CHECK_EQ(2, pad_conf.pad_c_size());
CHECK_EQ(2, pad_conf.pad_h_size()); CHECK_EQ(2, pad_conf.pad_h_size());
CHECK_EQ(2, pad_conf.pad_w_size()); CHECK_EQ(2, pad_conf.pad_w_size());
padc_.push_back(pad_conf.pad_c(0)); padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
padc_.push_back(pad_conf.pad_c(1)); padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
padh_.push_back(pad_conf.pad_h(0)); padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
padh_.push_back(pad_conf.pad_h(1));
padw_.push_back(pad_conf.pad_w(0));
padw_.push_back(pad_conf.pad_w(1));
outDims_ = TensorShape(4); outDims_ = TensorShape(4);
setOutDims(0); setOutDims(0);
...@@ -49,21 +46,15 @@ bool PadLayer::init(const LayerMap& layerMap, ...@@ -49,21 +46,15 @@ bool PadLayer::init(const LayerMap& layerMap,
createFunction(forward_, createFunction(forward_,
"Pad", "Pad",
FuncConfig() FuncConfig()
.set("cstart", padc_[0]) .set("channel", padc_)
.set("cend", padc_[1]) .set("height", padh_)
.set("hstart", padh_[0]) .set("width", padw_));
.set("hend", padh_[1])
.set("wstart", padw_[0])
.set("wend", padw_[1]));
createFunction(backward_, createFunction(backward_,
"PadGrad", "PadGrad",
FuncConfig() FuncConfig()
.set("cstart", padc_[0]) .set("channel", padc_)
.set("cend", padc_[1]) .set("height", padh_)
.set("hstart", padh_[0]) .set("width", padw_));
.set("hend", padh_[1])
.set("wstart", padw_[0])
.set("wend", padw_[1]));
return true; return true;
} }
......
...@@ -38,9 +38,9 @@ protected: ...@@ -38,9 +38,9 @@ protected:
void setOutDims(const size_t batchSize); void setOutDims(const size_t batchSize);
void setTensorDim(const size_t batchSize); void setTensorDim(const size_t batchSize);
std::vector<int> padc_; std::vector<uint32_t> padc_;
std::vector<int> padh_; std::vector<uint32_t> padh_;
std::vector<int> padw_; std::vector<uint32_t> padw_;
TensorShape inDims_; TensorShape inDims_;
TensorShape outDims_; TensorShape outDims_;
}; };
......
...@@ -25,6 +25,11 @@ namespace paddle { ...@@ -25,6 +25,11 @@ namespace paddle {
* Input: a sequence * Input: a sequence
* If SequenceLevel = kNonseq: * If SequenceLevel = kNonseq:
* Output: a sequence containing only the last instance of the input sequence * Output: a sequence containing only the last instance of the input sequence
* If stride_ > 0:
* Output: a shortened sequence. The operation of getting the last
* instance of a sequence is performed independently on every slice
* of the input sequence; slices are obtained by sliding a window
* whose size is set to stride_.
* If SequenceLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: a sequence containing only the last instance of each sub-sequence * Output: a sequence containing only the last instance of each sub-sequence
...@@ -37,6 +42,7 @@ class SequenceLastInstanceLayer : public SequencePoolLayer { ...@@ -37,6 +42,7 @@ class SequenceLastInstanceLayer : public SequencePoolLayer {
protected: protected:
MatrixPtr tmpSrc_; MatrixPtr tmpSrc_;
MatrixPtr tmpDest_; MatrixPtr tmpDest_;
std::vector<int> instanceIds_;
public: public:
explicit SequenceLastInstanceLayer(const LayerConfig& config) explicit SequenceLastInstanceLayer(const LayerConfig& config)
...@@ -54,6 +60,7 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); ...@@ -54,6 +60,7 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
SequencePoolLayer::init(layerMap, parameterMap); SequencePoolLayer::init(layerMap, parameterMap);
reversed_ = config_.select_first();
tmpSrc_ = tmpSrc_ =
Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
...@@ -66,7 +73,8 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, ...@@ -66,7 +73,8 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
void SequenceLastInstanceLayer::forward(PassType passType) { void SequenceLastInstanceLayer::forward(PassType passType) {
SequencePoolLayer::forward(passType); SequencePoolLayer::forward(passType);
const int* starts = startPositions_->getData(false); auto starts = (stride_ > 0) ? stridePositions_->getData()
: startPositions_->getData(false);
MatrixPtr inputValue = getInputValue(0); MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue(); MatrixPtr outputValue = getOutputValue();
...@@ -74,9 +82,10 @@ void SequenceLastInstanceLayer::forward(PassType passType) { ...@@ -74,9 +82,10 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
AsyncGpuBlock asyncGpuBlock; AsyncGpuBlock asyncGpuBlock;
REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
instanceIds_.clear();
for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
int insId = int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1; instanceIds_.push_back(insId);
outputValue->subMatrix(seqId, 1, tmpDest_) outputValue->subMatrix(seqId, 1, tmpDest_)
->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
...@@ -96,18 +105,13 @@ void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { ...@@ -96,18 +105,13 @@ void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
MatrixPtr inputGrad = getInputGrad(0); MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad(); MatrixPtr outputGrad = getOutputGrad();
const int* starts = startPositions_->getData(false);
size_t numSequences = startPositions_->getSize() - 1;
if (inputGrad) { if (inputGrad) {
AsyncGpuBlock asyncGpuBlock; AsyncGpuBlock asyncGpuBlock;
REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str()); REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());
for (size_t seqId = 0; seqId < numSequences; ++seqId) { for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
int insId = inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
inputGrad->subMatrix(insId, 1, tmpDest_)
->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_))); ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
} }
} }
......
...@@ -37,6 +37,7 @@ bool SequencePoolLayer::init(const LayerMap& layerMap, ...@@ -37,6 +37,7 @@ bool SequencePoolLayer::init(const LayerMap& layerMap,
} else { } else {
LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
} }
stride_ = config_.seq_pool_stride();
setNeedSequenceInfo(false); setNeedSequenceInfo(false);
return true; return true;
} }
...@@ -55,8 +56,6 @@ void SequencePoolLayer::forward(PassType passType) { ...@@ -55,8 +56,6 @@ void SequencePoolLayer::forward(PassType passType) {
CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize()); CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
CHECK_EQ(newBatchSize_, starts->getSize() - 1); CHECK_EQ(newBatchSize_, starts->getSize() - 1);
resetOutput(newBatchSize_, dim);
/* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
* thus, in this case, output_ has no sequenceStartPositions. * thus, in this case, output_ has no sequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
...@@ -67,6 +66,15 @@ void SequencePoolLayer::forward(PassType passType) { ...@@ -67,6 +66,15 @@ void SequencePoolLayer::forward(PassType passType) {
<< "when trans_type = seq, input must hasSubseq"; << "when trans_type = seq, input must hasSubseq";
output_.degradeSequence(input); output_.degradeSequence(input);
} }
if (stride_ > 0) {
CHECK_EQ(input.hasSubseq(), 0UL)
<< "sequence stride pooling is invalid for hasSubseq now";
output_.poolSequenceWithStride(
input, stride_, &stridePositions_, reversed_);
newBatchSize_ = stridePositions_->getSize() - 1;
}
resetOutput(newBatchSize_, dim);
} }
void SequencePoolLayer::backward(const UpdateCallback& callback) { void SequencePoolLayer::backward(const UpdateCallback& callback) {
......
...@@ -26,6 +26,10 @@ namespace paddle { ...@@ -26,6 +26,10 @@ namespace paddle {
* Output: output size is the number of input sequences (NOT input instances) * Output: output size is the number of input sequences (NOT input instances)
* output[i] = seqlastin/average/max_{for each instance in this * output[i] = seqlastin/average/max_{for each instance in this
* sequence}{input[i]} * sequence}{input[i]}
* If stride_ > 0:
* Check that the input sequence has no sub-sequences
* Output: a shortened sequence; pooling is performed upon a small
* local area
* If SequenceLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences * Output: output size is the number of input sub-sequences
...@@ -42,6 +46,11 @@ protected: ...@@ -42,6 +46,11 @@ protected:
enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
size_t newBatchSize_; size_t newBatchSize_;
ICpuGpuVectorPtr startPositions_; ICpuGpuVectorPtr startPositions_;
int stride_;
// Store the start position of each window.
IVectorPtr stridePositions_;
// Whether the input sequence is reversed or not.
bool reversed_ = false;
public: public:
explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {} explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
......
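To make the stride semantics concrete, here is a sketch of the window
positions that :code:`poolSequenceWithStride` presumably derives;
:code:`strideStartPositions` is a hypothetical helper, shown for
illustration only:

.. code-block:: cpp

   #include <vector>

   // Each input sequence [starts[i], starts[i+1]) is cut into windows of
   // at most `stride` steps; pooling then runs on every window separately.
   std::vector<int> strideStartPositions(const std::vector<int>& starts,
                                         int stride) {
     std::vector<int> pos;
     for (size_t i = 0; i + 1 < starts.size(); ++i) {
       for (int s = starts[i]; s < starts[i + 1]; s += stride) {
         pos.push_back(s);
       }
     }
     pos.push_back(starts.back());  // closing boundary of the last window
     return pos;
   }
   // e.g. starts = {0, 7}, stride = 5  ->  {0, 5, 7}: windows [0,5) and [5,7)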
...@@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf, ...@@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf,
config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize; config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
config.layerConfig.set_bias_size(config.biasSize); config.layerConfig.set_bias_size(config.biasSize);
config.layerConfig.set_shared_biases(sharedBias); config.layerConfig.set_shared_biases(sharedBias);
config.inputDefs.push_back( config.inputDefs.push_back({inputType,
{inputType, "layer_0", conf.input_size(), parameterSize}); "layer_0",
static_cast<size_t>(conf.input_size()),
parameterSize});
*config.layerConfig.add_inputs()->mutable_proj_conf() = conf; *config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
config.testState = testState; config.testState = testState;
testLayerGrad(config, "mixed", batchSize, false, useGpu); testLayerGrad(config, "mixed", batchSize, false, useGpu);
......
...@@ -804,10 +804,14 @@ TEST(Layer, ExpandLayer) { ...@@ -804,10 +804,14 @@ TEST(Layer, ExpandLayer) {
testExpandLayer("seq", true); // seq expand to hasSubseq testExpandLayer("seq", true); // seq expand to hasSubseq
} }
void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) { void testDegradeLayer(bool hasSubseq,
string layer_type,
string trans_type,
int stride) {
TestConfig config; TestConfig config;
config.layerConfig.set_type(layer_type); config.layerConfig.set_type(layer_type);
config.layerConfig.set_size(10); config.layerConfig.set_size(10);
config.layerConfig.set_seq_pool_stride(stride);
config.biasSize = 0; config.biasSize = 0;
config.inputDefs.push_back( config.inputDefs.push_back(
...@@ -827,36 +831,46 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) { ...@@ -827,36 +831,46 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
if (layer_type == "average") { if (layer_type == "average") {
for (auto strategy : {"average", "sum", "squarerootn"}) { for (auto strategy : {"average", "sum", "squarerootn"}) {
LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
<< " average_strategy=" << strategy; << " average_strategy=" << strategy
<< " seq_pool_stride=" << stride;
config.layerConfig.set_average_strategy(strategy); config.layerConfig.set_average_strategy(strategy);
testDegradeLayerGrad(config, layer_type); testDegradeLayerGrad(config, layer_type);
} }
} else { } else {
LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type; LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
<< " seq_pool_stride=" << stride;
testDegradeLayerGrad(config, layer_type); testDegradeLayerGrad(config, layer_type);
} }
} }
TEST(Layer, MaxLayer) { TEST(Layer, MaxLayer) {
testDegradeLayer(false, "max", "non-seq"); // seq max to non-seq testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq
testDegradeLayer(true, "max", "non-seq"); // hasSubseq max to non-seq testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq
testDegradeLayer(true, "max", "seq"); // hasSubseq max to seq testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq
} }
TEST(Layer, SequenceLastInstanceLayer) { TEST(Layer, SequenceLastInstanceLayer) {
testDegradeLayer(false, testDegradeLayer(false,
"seqlastins", "seqlastins",
"non-seq"); // seq seqlastins to non-seq "non-seq",
-1); // seq seqlastins to non-seq
testDegradeLayer(false,
"seqlastins",
"non-seq",
5); // seq seqlastins to a shorten seq, stride window = 5
testDegradeLayer(true, testDegradeLayer(true,
"seqlastins", "seqlastins",
"non-seq"); // hasSubseq seqlastins to non-seq "non-seq",
testDegradeLayer(true, "seqlastins", "seq"); // hasSubseq seqlastins to seq -1); // hasSubseq seqlastins to non-seq
testDegradeLayer(
true, "seqlastins", "seq", -1); // hasSubseq seqlastins to seq
} }
TEST(Layer, AverageLayer) { TEST(Layer, AverageLayer) {
testDegradeLayer(false, "average", "non-seq"); // seq average to non-seq testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq
testDegradeLayer(true, "average", "non-seq"); // hasSubseq average to non-seq testDegradeLayer(
testDegradeLayer(true, "average", "seq"); // hasSubseq average to seq true, "average", "non-seq", -1); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq
} }
TEST(Layer, SequenceConcatLayer) { TEST(Layer, SequenceConcatLayer) {
......
...@@ -85,11 +85,16 @@ int getrf<float>(const CBLAS_ORDER order, ...@@ -85,11 +85,16 @@ int getrf<float>(const CBLAS_ORDER order,
float* A, float* A,
const int lda, const int lda,
int* ipiv) { int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS #ifdef PADDLE_USE_ATLAS
return clapack_sgetrf(order, M, N, A, lda, ipiv); return clapack_sgetrf(order, M, N, A, lda, ipiv);
#else #else
return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); return LAPACKE_sgetrf(order, M, N, A, lda, ipiv);
#endif #endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
} }
template <> template <>
...@@ -99,11 +104,16 @@ int getrf<double>(const CBLAS_ORDER order, ...@@ -99,11 +104,16 @@ int getrf<double>(const CBLAS_ORDER order,
double* A, double* A,
const int lda, const int lda,
int* ipiv) { int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS #ifdef PADDLE_USE_ATLAS
return clapack_dgetrf(order, M, N, A, lda, ipiv); return clapack_dgetrf(order, M, N, A, lda, ipiv);
#else #else
return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); return LAPACKE_dgetrf(order, M, N, A, lda, ipiv);
#endif #endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
} }
template <> template <>
...@@ -112,11 +122,16 @@ int getri<float>(const CBLAS_ORDER order, ...@@ -112,11 +122,16 @@ int getri<float>(const CBLAS_ORDER order,
float* A, float* A,
const int lda, const int lda,
const int* ipiv) { const int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS #ifdef PADDLE_USE_ATLAS
return clapack_sgetri(order, N, A, lda, ipiv); return clapack_sgetri(order, N, A, lda, ipiv);
#else #else
return LAPACKE_sgetri(order, N, A, lda, ipiv); return LAPACKE_sgetri(order, N, A, lda, ipiv);
#endif #endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
} }
template <> template <>
...@@ -125,11 +140,16 @@ int getri<double>(const CBLAS_ORDER order, ...@@ -125,11 +140,16 @@ int getri<double>(const CBLAS_ORDER order,
double* A, double* A,
const int lda, const int lda,
const int* ipiv) { const int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS #ifdef PADDLE_USE_ATLAS
return clapack_dgetri(order, N, A, lda, ipiv); return clapack_dgetri(order, N, A, lda, ipiv);
#else #else
return LAPACKE_dgetri(order, N, A, lda, ipiv); return LAPACKE_dgetri(order, N, A, lda, ipiv);
#endif #endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
} }
template <> template <>
......
...@@ -17,11 +17,14 @@ limitations under the License. */ ...@@ -17,11 +17,14 @@ limitations under the License. */
#ifdef PADDLE_USE_MKL #ifdef PADDLE_USE_MKL
#include <mkl.h> #include <mkl.h>
#ifdef PADDLE_USE_LAPACK
#include <mkl_lapacke.h> #include <mkl_lapacke.h>
#endif
#else #else
extern "C" { extern "C" {
#include <cblas.h> #include <cblas.h>
} }
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS #ifdef PADDLE_USE_ATLAS
extern "C" { extern "C" {
#include <clapack.h> #include <clapack.h>
...@@ -30,6 +33,7 @@ extern "C" { ...@@ -30,6 +33,7 @@ extern "C" {
#include <lapacke.h> #include <lapacke.h>
#endif #endif
#endif #endif
#endif
#include <cmath> #include <cmath>
......
...@@ -483,6 +483,20 @@ void GpuMatrix::sequenceAvgForward(Matrix& a, ...@@ -483,6 +483,20 @@ void GpuMatrix::sequenceAvgForward(Matrix& a,
hl_sequence_avg_forward(dst, src, starts, height, width, mode); hl_sequence_avg_forward(dst, src, starts, height, width, mode);
} }
void GpuMatrix::sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
size_t height = a.getHeight();
size_t width = getWidth();
CHECK_EQ(height, startsPos.getSize() - 1);
CHECK_EQ(width, a.getWidth());
real* dst = getData();
real* src = a.getData();
const int* starts = startsPos.getData();
hl_sequence_avg_backward(dst, src, starts, height, width, mode);
}
/* this = scaleAB*(a*b) + scaleT*this */ /* this = scaleAB*(a*b) + scaleT*this */
void GpuMatrix::mul(const GpuMatrix& a, void GpuMatrix::mul(const GpuMatrix& a,
const GpuMatrix& b, const GpuMatrix& b,
...@@ -2304,6 +2318,41 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, ...@@ -2304,6 +2318,41 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
} }
} }
void CpuMatrix::sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
size_t height = a.getHeight();
size_t width = getWidth();
CHECK_EQ(height, startsPos.getSize() - 1);
CHECK_EQ(width, a.getWidth());
real* dst = getData();
real* src = a.getData();
const int* starts = startsPos.getData();
MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
for (size_t i = 0; i < height; ++i) {
int sequenceLength = starts[i + 1] - starts[i];
if (0 == sequenceLength) {
// empty sequence
continue;
}
outMtx->setData(dst + starts[i] * width, sequenceLength, width);
dataMtx->setData(src + i * width);
if (mode == 0) {
// plain average
outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
} else if (mode == 1) {
// sum instead of average
outMtx->addBias(*dataMtx, 1.0f);
} else if (mode == 2) {
// divide by square root of sequenceLength
outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
} else {
LOG(FATAL) << "should not reach here";
}
}
}
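For reference, the backward pass above simply redistributes each sequence's output gradient across its timesteps with a mode-dependent scale, mirroring sequenceAvgForward. A minimal standalone sketch of that arithmetic (plain C++; a hypothetical helper, not the Paddle API):

```cpp
#include <cmath>
#include <cstdio>

// Per-timestep scale applied when the output gradient of a sequence of
// length `len` is redistributed over its timesteps; mirrors the three
// modes handled by sequenceAvgBackward above.
static float gradScale(int mode, int len) {
  if (mode == 0) return 1.0f / len;  // plain average
  if (mode == 1) return 1.0f;        // sum instead of average
  if (mode == 2) return 1.0f / std::sqrt(static_cast<float>(len));  // sqrt
  return 0.0f;  // unreachable for valid modes
}

int main() {
  // A sequence of length 4 with output gradient g = 2.0 gives each
  // timestep 0.5 (mode 0), 2.0 (mode 1), or 1.0 (mode 2).
  const float g = 2.0f;
  for (int mode = 0; mode < 3; ++mode) {
    std::printf("mode %d: per-timestep grad = %g\n", mode,
                g * gradScale(mode, 4));
  }
  return 0;
}
```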
/* this = scaleAB*(a*b) + scaleT*this*/ /* this = scaleAB*(a*b) + scaleT*this*/
void CpuMatrix::mul(const Matrix& a, void CpuMatrix::mul(const Matrix& a,
const Matrix& b, const Matrix& b,
...@@ -2377,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { ...@@ -2377,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
int lda = a->getStride(); int lda = a->getStride();
int ldb = b->getStride(); int ldb = b->getStride();
int ldc = getStride(); int ldc = getStride();
#ifndef PADDLE_TYPE_DOUBLE gemm<real>(
cblas_sgemm(CblasRowMajor, a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
a_trans,
b_trans,
M,
N,
K,
scaleAB,
A,
lda,
B,
ldb,
scaleT,
C,
ldc);
#else
cblas_dgemm(CblasRowMajor,
a_trans,
b_trans,
M,
N,
K,
scaleAB,
A,
lda,
B,
ldb,
scaleT,
C,
ldc);
// TODO(yuyang18): Is gemm defined other place?
#endif
VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0]
<< " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1];
} }
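The hunk above folds the separate cblas_sgemm/cblas_dgemm branches into a single gemm<real> call. The wrapper's definition is not part of this diff; a plausible sketch, with the row-major order fixed inside and one explicit specialization per precision (the exact signature is an assumption), might look like this:

```cpp
#include <cblas.h>

// Hypothetical gemm<T> wrapper consistent with the call site above; the
// real definition lives elsewhere in the tree and may differ.
template <class T>
void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
          const int M, const int N, const int K, const T alpha, const T* A,
          const int lda, const T* B, const int ldb, const T beta, T* C,
          const int ldc);

template <>
void gemm<float>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
                 const int M, const int N, const int K, const float alpha,
                 const float* A, const int lda, const float* B, const int ldb,
                 const float beta, float* C, const int ldc) {
  // Single-precision path, formerly the #ifndef PADDLE_TYPE_DOUBLE branch.
  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}

template <>
void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
                  const int M, const int N, const int K, const double alpha,
                  const double* A, const int lda, const double* B,
                  const int ldb, const double beta, double* C, const int ldc) {
  // Double-precision path, formerly the #else branch.
  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}
```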
void CpuMatrix::mul( void CpuMatrix::mul(
......
...@@ -461,6 +461,12 @@ public: ...@@ -461,6 +461,12 @@ public:
LOG(FATAL) << "Not implemented"; LOG(FATAL) << "Not implemented";
} }
virtual void sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
LOG(FATAL) << "Not implemented";
}
/** /**
* @code * @code
* this = scaleAB*(a*b) + scaleT*this * this = scaleAB*(a*b) + scaleT*this
...@@ -1203,6 +1209,7 @@ public: ...@@ -1203,6 +1209,7 @@ public:
void collectSharedBias(Matrix& a, real scale); void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
/** /**
* @code * @code
...@@ -1619,6 +1626,7 @@ public: ...@@ -1619,6 +1626,7 @@ public:
void collectSharedBias(Matrix& a, real scale); void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
/** /**
* @code * @code
......
...@@ -13,119 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,119 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "SIMDFunctions.h" #include "SIMDFunctions.h"
#ifdef __SSE3__
#include <immintrin.h> #include <immintrin.h>
#endif
#include <algorithm> #include <algorithm>
#ifndef __AVX__ #ifdef __AVX__
static void addto_sse(float* a, const float* b, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
mb0 = _mm_load_ps(b);
mb1 = _mm_load_ps(b + 4);
mb2 = _mm_load_ps(b + 8);
mb3 = _mm_load_ps(b + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) a[i] += b[i];
}
static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
for (int i = 0; i < batch; i++) {
mb0 = _mm_load_ps(b[i]);
mb1 = _mm_load_ps(b[i] + 4);
mb2 = _mm_load_ps(b[i] + 8);
mb3 = _mm_load_ps(b[i] + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
b[i] += 16;
}
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) {
for (int k = 0; k < batch; k++) a[i] += b[k][i];
}
return;
}
static void col_max_sse(float* result,
const float* data,
int dim,
int numSamples) {
// first sample, direct copy
for (int d = 0; d < dim; ++d) {
result[d] = data[d];
}
int offset = dim % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
// first 16n dims
for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
ma0 = _mm_load_ps(result);
ma1 = _mm_load_ps(result + 4);
ma2 = _mm_load_ps(result + 8);
ma3 = _mm_load_ps(result + 12);
for (int i = 1; i < numSamples; i++) {
mb0 = _mm_load_ps(data + i * dim);
mb1 = _mm_load_ps(data + i * dim + 4);
mb2 = _mm_load_ps(data + i * dim + 8);
mb3 = _mm_load_ps(data + i * dim + 12);
ma0 = _mm_max_ps(ma0, mb0);
ma1 = _mm_max_ps(ma1, mb1);
ma2 = _mm_max_ps(ma2, mb2);
ma3 = _mm_max_ps(ma3, mb3);
}
_mm_store_ps(result, ma0);
_mm_store_ps(result + 4, ma1);
_mm_store_ps(result + 8, ma2);
_mm_store_ps(result + 12, ma3);
}
// last dims
for (int d = 0; d < offset; ++d) {
float sm = data[d];
for (int i = 1; i < numSamples; ++i) {
sm = std::max(sm, data[i * dim + d]);
}
result[d] = sm;
}
}
#else
static void addto_avx(float* a, const float* b, size_t len) { static void addto_avx(float* a, const float* b, size_t len) {
int offset = len % 32; int offset = len % 32;
...@@ -355,17 +248,128 @@ static void decayL1_avx( ...@@ -355,17 +248,128 @@ static void decayL1_avx(
} }
} }
#elif defined(__SSE3__)
static void addto_sse(float* a, const float* b, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
mb0 = _mm_load_ps(b);
mb1 = _mm_load_ps(b + 4);
mb2 = _mm_load_ps(b + 8);
mb3 = _mm_load_ps(b + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) a[i] += b[i];
}
static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
for (int i = 0; i < batch; i++) {
mb0 = _mm_load_ps(b[i]);
mb1 = _mm_load_ps(b[i] + 4);
mb2 = _mm_load_ps(b[i] + 8);
mb3 = _mm_load_ps(b[i] + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
b[i] += 16;
}
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) {
for (int k = 0; k < batch; k++) a[i] += b[k][i];
}
return;
}
static void col_max_sse(float* result,
const float* data,
int dim,
int numSamples) {
// first sample, direct copy
for (int d = 0; d < dim; ++d) {
result[d] = data[d];
}
int offset = dim % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
// first 16n dims
for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
ma0 = _mm_load_ps(result);
ma1 = _mm_load_ps(result + 4);
ma2 = _mm_load_ps(result + 8);
ma3 = _mm_load_ps(result + 12);
for (int i = 1; i < numSamples; i++) {
mb0 = _mm_load_ps(data + i * dim);
mb1 = _mm_load_ps(data + i * dim + 4);
mb2 = _mm_load_ps(data + i * dim + 8);
mb3 = _mm_load_ps(data + i * dim + 12);
ma0 = _mm_max_ps(ma0, mb0);
ma1 = _mm_max_ps(ma1, mb1);
ma2 = _mm_max_ps(ma2, mb2);
ma3 = _mm_max_ps(ma3, mb3);
}
_mm_store_ps(result, ma0);
_mm_store_ps(result + 4, ma1);
_mm_store_ps(result + 8, ma2);
_mm_store_ps(result + 12, ma3);
}
// last dims
for (int d = 0; d < offset; ++d) {
float sm = data[d];
for (int i = 1; i < numSamples; ++i) {
sm = std::max(sm, data[i * dim + d]);
}
result[d] = sm;
}
}
#endif #endif
#ifndef __AVX__ #if defined(__AVX__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#else
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__) #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#elif defined(__SSE3__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#endif #endif
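SIMD_INVOKE is plain token pasting: it appends the suffix of the widest instruction set the compiler is allowed to emit, falling back from AVX to SSE3. A self-contained sketch of the same dispatch pattern, with stub kernels standing in for the real ones:

```cpp
#include <cstdio>

// Stub kernels; the real addto_avx/addto_sse bodies are shown above.
static void addto_avx(float* a, const float* b, int len) {
  std::printf("avx path\n");
  for (int i = 0; i < len; ++i) a[i] += b[i];
}
static void addto_sse(float* a, const float* b, int len) {
  std::printf("sse path\n");
  for (int i = 0; i < len; ++i) a[i] += b[i];
}

// Same selection logic as above: prefer AVX, else fall back to SSE3.
#if defined(__AVX__)
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#elif defined(__SSE3__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#endif

int main() {
  float a[4] = {1, 2, 3, 4};
  const float b[4] = {4, 3, 2, 1};
#ifdef SIMD_INVOKE
  SIMD_INVOKE(addto, a, b, 4);  // expands to addto_avx(...) or addto_sse(...)
#endif
  return 0;
}
```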
namespace paddle { namespace paddle {
namespace simd { namespace simd {
namespace internal { namespace internal {
#ifdef __SSE3__
void addToImpl(float* a, const float* b, size_t len) { void addToImpl(float* a, const float* b, size_t len) {
SIMD_INVOKE(addto, a, b, len); SIMD_INVOKE(addto, a, b, len);
} }
...@@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) { ...@@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
void colMaxImpl(float* result, const float* data, int dim, int numSamples) { void colMaxImpl(float* result, const float* data, int dim, int numSamples) {
SIMD_INVOKE(col_max, result, data, dim, numSamples); SIMD_INVOKE(col_max, result, data, dim, numSamples);
} }
#endif
#ifdef __AVX__ #ifdef __AVX__
void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) { void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) {
...@@ -385,8 +390,8 @@ void decayL1AvxImpl( ...@@ -385,8 +390,8 @@ void decayL1AvxImpl(
float* dst, float* src, float* lr, float lambda, size_t len) { float* dst, float* src, float* lr, float lambda, size_t len) {
decayL1_avx(dst, src, lr, lambda, len); decayL1_avx(dst, src, lr, lambda, len);
} }
#endif #endif
} // namespace internal } // namespace internal
} // namespace simd } // namespace simd
} // namespace paddle } // namespace paddle
...@@ -128,17 +128,29 @@ void decayL1AvxImpl( ...@@ -128,17 +128,29 @@ void decayL1AvxImpl(
template <> template <>
inline void addTo(float* a, const float* b, size_t len) { inline void addTo(float* a, const float* b, size_t len) {
#ifdef __SSE3__
internal::addToImpl(a, b, len); internal::addToImpl(a, b, len);
#else
naive::addTo(a, b, len);
#endif
} }
template <> template <>
inline void batchAddTo(float* a, const float* b[], int batch, size_t len) { inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
#ifdef __SSE3__
internal::batchAddToImpl(a, b, batch, len); internal::batchAddToImpl(a, b, batch, len);
#else
naive::batchAddTo(a, b, batch, len);
#endif
} }
template <> template <>
inline void colMax(float* result, const float* data, int dim, int numSamples) { inline void colMax(float* result, const float* data, int dim, int numSamples) {
#ifdef __SSE3__
internal::colMaxImpl(result, data, dim, numSamples); internal::colMaxImpl(result, data, dim, numSamples);
#else
naive::colMax(result, data, dim, numSamples);
#endif
} }
template <> template <>
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "Storage.h" #include "Storage.h"
#include "Allocator.h" #include "Allocator.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
DEFINE_int32(pool_limit_size, DEFINE_int32(pool_limit_size,
...@@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { ...@@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
} }
if (gpuAllocator_[deviceId] == nullptr) { if (gpuAllocator_[deviceId] == nullptr) {
std::string name = std::string name =
"gpu" + std::to_string(deviceId) + std::string("_pool"); "gpu" + str::to_string(deviceId) + std::string("_pool");
gpuAllocator_[deviceId] = gpuAllocator_[deviceId] =
new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name);
} }
......
...@@ -685,7 +685,7 @@ TEST(SMatrix, topK) { ...@@ -685,7 +685,7 @@ TEST(SMatrix, topK) {
} }
} }
void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) { void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim); MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim); MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInput->randomizeUniform(); cpuInput->randomizeUniform();
...@@ -706,15 +706,25 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) { ...@@ -706,15 +706,25 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode); gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
TensorCheckErr(*cpuOutput, *gpuOutput); TensorCheckErr(*cpuOutput, *gpuOutput);
MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInGrad->randomizeUniform();
gpuInGrad->copyFrom(*cpuInGrad);
cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
TensorCheckErr(*cpuInGrad, *gpuInGrad);
} }
TEST(Matrix, sequenceAvgForward) { TEST(Matrix, sequenceAvg) {
for (auto batchSize : {10, 128, 6000}) { for (auto batchSize : {10, 128, 6000}) {
for (auto inputDim : {32, 100, 512}) { for (auto inputDim : {32, 100, 512}) {
for (auto mode : {0, 1, 2}) { for (auto mode : {0, 1, 2}) {
VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
<< " mode=" << mode; << " mode=" << mode;
testMatrixSequenceAvgForward(batchSize, inputDim, mode); testMatrixSequenceAvg(batchSize, inputDim, mode);
} }
} }
} }
......
...@@ -559,6 +559,49 @@ void Argument::degradeSequence(const Argument& input) { ...@@ -559,6 +559,49 @@ void Argument::degradeSequence(const Argument& input) {
tgtBuf[numSequences] = numSubSequences; tgtBuf[numSequences] = numSubSequences;
} }
void Argument::poolSequenceWithStride(const Argument& input,
size_t stride,
IVectorPtr* stridePositions,
bool reversed) {
// If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
// then sequenceStartPositions = [0, 2, 3, 4, 7].
// If reversed = false, stridePositions = [0, 5, 9, 14, 17, 22, 27, 30];
// if reversed = true, stridePositions = [0, 4, 9, 14, 17, 20, 25, 30].
CHECK(input.sequenceStartPositions);
CHECK_EQ(input.hasSubseq(), 0UL);
CHECK_GT(stride, 0) << "stride must be larger than 0";
size_t numSequences = input.getNumSequences();
ICpuGpuVector::resizeOrCreate(
sequenceStartPositions, numSequences + 1, false);
const int* starts = input.sequenceStartPositions->getData(false);
int* tgtBuf = sequenceStartPositions->getMutableData(false);
// first index of target sequence and stride positions are both 0
tgtBuf[0] = 0;
std::vector<int> stridePos;
for (size_t seqId = 0; seqId < numSequences; ++seqId) {
size_t seqLength = starts[seqId + 1] - starts[seqId];
stridePos.emplace_back(starts[seqId]);
if (seqLength == 0) {
// empty sequence
tgtBuf[seqId + 1] = tgtBuf[seqId];
} else {
int size = ceil((float)seqLength / stride);
tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
for (int i = 0; i < size - 1; ++i) {
int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
: stridePos.back() + stride;
stridePos.emplace_back(cur);
}
}
}
stridePos.emplace_back(starts[numSequences]);
int size = stridePos.size();
CHECK_EQ(size - 1, tgtBuf[numSequences]);
IVector::resizeOrCreate(*stridePositions, size, false);
(*stridePositions)->copyFrom(stridePos.data(), size);
}
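As a cross-check of the worked example in the comment above, here is a standalone sketch (a hypothetical helper, not the Paddle API) that reproduces the stride-position arithmetic for starts = [0, 9, 14, 17, 30] and stride = 5:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Compute strided window start positions for one level of sequences,
// following the same forward/reversed rules as poolSequenceWithStride.
std::vector<int> computeStridePositions(const std::vector<int>& starts,
                                        int stride, bool reversed) {
  std::vector<int> pos;
  const int numSeq = static_cast<int>(starts.size()) - 1;
  for (int s = 0; s < numSeq; ++s) {
    const int len = starts[s + 1] - starts[s];
    pos.push_back(starts[s]);
    if (len == 0) continue;  // an empty sequence contributes only its start
    const int size =
        static_cast<int>(std::ceil(static_cast<float>(len) / stride));
    for (int i = 0; i < size - 1; ++i) {
      pos.push_back(reversed ? starts[s + 1] - (size - 1 - i) * stride
                             : pos.back() + stride);
    }
  }
  pos.push_back(starts[numSeq]);
  return pos;
}

int main() {
  const std::vector<int> starts = {0, 9, 14, 17, 30};
  for (bool reversed : {false, true}) {
    // Expect [0,5,9,14,17,22,27,30] forward and [0,4,9,14,17,20,25,30]
    // reversed, matching the comment above and test_argument below.
    for (int p : computeStridePositions(starts, 5, reversed)) {
      std::printf("%d ", p);
    }
    std::printf("\n");
  }
  return 0;
}
```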
void Argument::getValueString( void Argument::getValueString(
std::unordered_map<std::string, std::string>* out) const { std::unordered_map<std::string, std::string>* out) const {
if (value) { if (value) {
......
...@@ -291,6 +291,15 @@ struct Argument { ...@@ -291,6 +291,15 @@ struct Argument {
*/ */
void degradeSequence(const Argument& input); void degradeSequence(const Argument& input);
/*
After pooling with stride n (n smaller than the sequence length),
a long sequence will be shortened.
This function is invalid for sequences that contain sub-sequences.
*/
void poolSequenceWithStride(const Argument& input,
size_t stride,
IVectorPtr* stridePositions,
bool reversed = false);
/** /**
* @brief getValueString will return the argument's output in string. There * @brief getValueString will return the argument's output in string. There
* are several kinds of output. The keys of output dictionary are 'value', * are several kinds of output. The keys of output dictionary are 'value',
......
add_simple_unittest(test_common) add_simple_unittest(test_common)
add_simple_unittest(test_argument)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/parameter/Argument.h>
using namespace paddle; // NOLINT
TEST(Argument, poolSequenceWithStride) {
Argument input, output;
ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false);
int* inStart = input.sequenceStartPositions->getMutableData(false);
inStart[0] = 0;
inStart[1] = 9;
inStart[2] = 14;
inStart[3] = 17;
inStart[4] = 30;
int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30};
int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
for (auto reversed : {false, true}) {
IVectorPtr stridePositions;
output.poolSequenceWithStride(
input, 5 /* stride */, &stridePositions, reversed);
const int* outStart = output.sequenceStartPositions->getData(false);
CHECK_EQ(outStart[0], 0);
CHECK_EQ(outStart[1], 2);
CHECK_EQ(outStart[2], 3);
CHECK_EQ(outStart[3], 4);
CHECK_EQ(outStart[4], 7);
CHECK_EQ(stridePositions->getSize(), 8UL);
auto result = reversed ? strideResultReversed : strideResult;
for (int i = 0; i < 8; i++) {
CHECK_EQ(stridePositions->getData()[i], result[i]);
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
...@@ -29,6 +29,7 @@ limitations under the License. */ ...@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/utils/Flags.h" #include "paddle/utils/Flags.h"
#include "paddle/utils/GlobalConstants.h" #include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h"
DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
DEFINE_double(async_lagged_ratio_min, DEFINE_double(async_lagged_ratio_min,
...@@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, ...@@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
callback(response); callback(response);
/// always defined, barrier slowest node function need it. /// always defined, barrier slowest node function need it.
statSet_.reset(new StatSet("ParameterServer" + std::to_string(serverId_))); statSet_.reset(new StatSet("ParameterServer" +
str::to_string(static_cast<int>(serverId_))));
} }
real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) { real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
......
...@@ -160,10 +160,19 @@ class SparseFloatScanner(SparseBinaryScanner): ...@@ -160,10 +160,19 @@ class SparseFloatScanner(SparseBinaryScanner):
class IndexScanner(IScanner): class IndexScanner(IScanner):
def __init__(self, input_type, pos): def __init__(self, input_type, pos):
IScanner.__init__(self, input_type, pos) IScanner.__init__(self, input_type, pos)
self.__ids__ = [] self.__ids__ = None
self.__idx__ = 0
def pre_scan(self, dat):
self.__idx__ += 1
def finish_pre_scan(self, argument):
self.__ids__ = [0] * self.__idx__
self.__idx__ = 0
def scan(self, dat): def scan(self, dat):
self.__ids__.append(dat) self.__ids__[self.__idx__] = dat
self.__idx__ += 1
def finish_scan(self, argument): def finish_scan(self, argument):
ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu) ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
...@@ -178,6 +187,13 @@ class SequenceScanner(IScanner): ...@@ -178,6 +187,13 @@ class SequenceScanner(IScanner):
self.__inner_scanner__ = inner_scanner self.__inner_scanner__ = inner_scanner
self.__setter__ = setter self.__setter__ = setter
def pre_scan(self, dat):
for each in dat:
self.__inner_scanner__.pre_scan(each)
def finish_pre_scan(self, argument):
self.__inner_scanner__.finish_pre_scan(argument)
def scan(self, dat): def scan(self, dat):
self.__seq__.append(self.__seq__[-1] + self.get_size(dat)) self.__seq__.append(self.__seq__[-1] + self.get_size(dat))
for each in dat: for each in dat:
......
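The converter now takes two passes: pre_scan counts the incoming elements, finish_pre_scan allocates the buffer once, and scan fills it in place, avoiding repeated list appends on large inputs. The same protocol in a compact C++ sketch (a hypothetical scanner, not the Paddle API):

```cpp
#include <cstddef>
#include <vector>

// Two-pass scanner: a counting pass sizes the buffer once, then the fill
// pass writes in place instead of growing a container element by element.
class TwoPassIndexScanner {
public:
  void preScan(int /*dat*/) { ++count_; }  // pass 1: only count
  void finishPreScan() {                   // allocate exactly once
    ids_.assign(count_, 0);
    idx_ = 0;
  }
  void scan(int dat) { ids_[idx_++] = dat; }  // pass 2: fill in place
  const std::vector<int>& ids() const { return ids_; }

private:
  std::size_t count_ = 0;
  std::size_t idx_ = 0;
  std::vector<int> ids_;
};

int main() {
  const int data[] = {7, 3, 5};
  TwoPassIndexScanner s;
  for (int d : data) s.preScan(d);
  s.finishPreScan();
  for (int d : data) s.scan(d);
  return (s.ids().size() == 3 && s.ids()[0] == 7) ? 0 : 1;
}
```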
...@@ -83,13 +83,17 @@ def __arguments_to_numpy__(i, arg): ...@@ -83,13 +83,17 @@ def __arguments_to_numpy__(i, arg):
assert isinstance(arg, swig_paddle.Arguments) assert isinstance(arg, swig_paddle.Arguments)
value = arg.getSlotValue(i) value = arg.getSlotValue(i)
ids = arg.getSlotIds(i) ids = arg.getSlotIds(i)
prob = arg.getSlotIn(i)
if value is not None: if value is not None:
assert isinstance(value, swig_paddle.Matrix) assert isinstance(value, swig_paddle.Matrix)
value = value.copyToNumpyMat() value = value.copyToNumpyMat()
if ids is not None: if ids is not None:
assert isinstance(ids, swig_paddle.IVector) assert isinstance(ids, swig_paddle.IVector)
ids = ids.copyToNumpyArray() ids = ids.copyToNumpyArray()
return {"value": value, "id": ids} if prob is not None:
assert isinstance(prob, swig_paddle.Matrix)
prob = prob.copyToNumpyMat()
return {"value": value, "id": ids, "prob": prob}
def __monkeypatch_gradient_machine__(): def __monkeypatch_gradient_machine__():
......
...@@ -94,7 +94,7 @@ docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com ...@@ -94,7 +94,7 @@ docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com
Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host): Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
```bash ```bash
docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev
``` ```
This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, can build the source code with any local changes. When it writes to `/paddle/build` in the container, it is actually writing to `$PWD/build` on the host. This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, can build the source code with any local changes. When it writes to `/paddle/build` in the container, it is actually writing to `$PWD/build` on the host.
...@@ -110,7 +110,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" ...@@ -110,7 +110,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
- `WITH_AVX`: ***Required***. Set to "OFF" to prevent generating AVX instructions. If you don't know what AVX is, you probably want "ON". - `WITH_AVX`: ***Required***. Set to "OFF" to prevent generating AVX instructions. If you don't know what AVX is, you probably want "ON".
- `WITH_TEST`: ***Optional, default OFF***. Build unit test binaries. Once you've built the unit tests, you can run them manually with the following command: - `WITH_TEST`: ***Optional, default OFF***. Build unit test binaries. Once you've built the unit tests, you can run them manually with the following command:
```bash ```bash
docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall" docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall"
``` ```
- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building them first. - `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building them first.
...@@ -129,7 +129,7 @@ This production image is minimal -- it includes binary `paddle`, the shared libr ...@@ -129,7 +129,7 @@ This production image is minimal -- it includes binary `paddle`, the shared libr
Again the development happens on the host. Suppose that we have a simple application program in `a.py`; we can test and run it using the production image: Again the development happens on the host. Suppose that we have a simple application program in `a.py`; we can test and run it using the production image:
```bash ```bash
docker run -it -v $PWD:/work paddle /work/a.py docker run --rm -it -v $PWD:/work paddle /work/a.py
``` ```
But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image based on the production image, with the extra dependencies installed. But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image based on the production image, with the extra dependencies installed.
...@@ -166,3 +166,18 @@ docker tag myapp me/myapp ...@@ -166,3 +166,18 @@ docker tag myapp me/myapp
docker push docker push
kubectl ... kubectl ...
``` ```
### Reading source code with woboq codebrowser
For developers interested in the C++ source code, pass `-e "WOBOQ=ON"` to build the C++ source into browsable HTML pages with [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
- The following command builds PaddlePaddle, generates HTML pages from the C++ source code, and writes them into `$HOME/woboq_out` on the host:
```bash
docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddle:dev
```
- You can open the generated HTML files in your Web browser. Or, if you want to run an Nginx container to serve them for a wider audience, you can run:
```
docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
```
...@@ -4,7 +4,7 @@ set -e ...@@ -4,7 +4,7 @@ set -e
# Set BASE_IMAGE according to env variables # Set BASE_IMAGE according to env variables
if [ ${WITH_GPU} == "ON" ]; then if [ ${WITH_GPU} == "ON" ]; then
BASE_IMAGE="nvidia/cuda:7.5-cudnn5-runtime-ubuntu14.04" BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu14.04"
# additional packages to install when building gpu images # additional packages to install when building gpu images
GPU_DOCKER_PKG="python-pip python-dev" GPU_DOCKER_PKG="python-pip python-dev"
else else
...@@ -12,11 +12,10 @@ else ...@@ -12,11 +12,10 @@ else
fi fi
DOCKERFILE_GPU_ENV="" DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
# for cmake to find cudnn
ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
fi fi
mkdir -p /paddle/build mkdir -p /paddle/build
...@@ -47,7 +46,7 @@ make install ...@@ -47,7 +46,7 @@ make install
# install them in docker # install them in docker
cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" .. cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" ..
if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev
# Install woboq_codebrowser. # Install woboq_codebrowser.
git clone https://github.com/woboq/woboq_codebrowser /woboq git clone https://github.com/woboq/woboq_codebrowser /woboq
...@@ -57,7 +56,7 @@ if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then ...@@ -57,7 +56,7 @@ if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then
. .
make make
export WOBOQ_OUT=/usr/share/nginx/html/paddle export WOBOQ_OUT=/woboq_out/paddle
export BUILD_DIR=/paddle/build export BUILD_DIR=/paddle/build
mkdir -p $WOBOQ_OUT mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data cp -rv /woboq/data $WOBOQ_OUT/../data
...@@ -95,7 +94,10 @@ RUN ${MIRROR_UPDATE} ...@@ -95,7 +94,10 @@ RUN ${MIRROR_UPDATE}
# Use different deb file when building different type of images # Use different deb file when building different type of images
ADD build/*.deb /usr/local/opt/paddle/deb/ ADD build/*.deb /usr/local/opt/paddle/deb/
# run paddle version to install python packages first # run paddle version to install python packages first
RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb && paddle version RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && \
rm -f /usr/local/opt/paddle/deb/*.deb && \
paddle version
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_GPU_ENV} ${DOCKERFILE_GPU_ENV}
# default command shows the paddle version and exit # default command shows the paddle version and exit
CMD ["paddle", "version"] CMD ["paddle", "version"]
......
...@@ -21,9 +21,7 @@ function version(){ ...@@ -21,9 +21,7 @@ function version(){
echo " with_double: @WITH_DOUBLE@" echo " with_double: @WITH_DOUBLE@"
echo " with_python: @WITH_PYTHON@" echo " with_python: @WITH_PYTHON@"
echo " with_rdma: @WITH_RDMA@" echo " with_rdma: @WITH_RDMA@"
echo " with_metric_learning: @WITH_METRIC@"
echo " with_timer: @WITH_TIMER@" echo " with_timer: @WITH_TIMER@"
echo " with_predict_sdk: @WITH_PREDICT_SDK@"
} }
function ver2num() { function ver2num() {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if __cplusplus > 201402L
#include <any>
namespace paddle {
// Use std::any for C++17.
using std::any;
using std::any_cast;
using std::bad_any_cast;
} // namespace paddle
#else
#include <any.hpp>
namespace paddle {
// Use linb::any for C++11.
using linb::any;
using linb::any_cast;
using linb::bad_any_cast;
} // namespace paddle
#endif
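Whichever backend is selected, callers see the same std::any-style surface. A standalone usage sketch written against std::any directly (C++17); with the alias above, the same code reads paddle::any / paddle::any_cast regardless of the chosen backend:

```cpp
#include <any>
#include <string>

int main() {
  std::any holder = std::string("attr");
  // any_cast by value throws bad_any_cast on a type mismatch.
  std::string s = std::any_cast<std::string>(holder);
  holder = 42;                           // rebind the holder to an int
  int* p = std::any_cast<int>(&holder);  // pointer form: nullptr on mismatch
  return (p != nullptr && *p == 42 && s == "attr") ? 0 : 1;
}
```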
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
/// for MSVC /// for MSVC
#define CPUID(info, x) __cpuidex(info, x, 0) #define CPUID(info, x) __cpuidex(info, x, 0)
#else #elif !defined(__ANDROID__)
#include <cpuid.h> #include <cpuid.h>
...@@ -31,6 +31,7 @@ limitations under the License. */ ...@@ -31,6 +31,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
SIMDFlags::SIMDFlags() { SIMDFlags::SIMDFlags() {
#if !defined(__ANDROID__)
unsigned int cpuInfo[4]; unsigned int cpuInfo[4];
// CPUID: https://en.wikipedia.org/wiki/CPUID // CPUID: https://en.wikipedia.org/wiki/CPUID
// clang-format off // clang-format off
...@@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() { ...@@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() {
CPUID(cpuInfo, 0x80000001); CPUID(cpuInfo, 0x80000001);
simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4 : SIMD_NONE; simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4 : SIMD_NONE;
// clang-format on // clang-format on
#else
simd_flags_ = SIMD_NEON;
#endif
} }
SIMDFlags const* SIMDFlags::instance() { SIMDFlags const* SIMDFlags::instance() {
......
...@@ -30,6 +30,7 @@ enum simd_t { ...@@ -30,6 +30,7 @@ enum simd_t {
SIMD_AVX = 1 << 8, ///< AVX SIMD_AVX = 1 << 8, ///< AVX
SIMD_AVX2 = 1 << 9, ///< AVX 2 SIMD_AVX2 = 1 << 9, ///< AVX 2
SIMD_AVX512 = 1 << 10, ///< AVX 512 SIMD_AVX512 = 1 << 10, ///< AVX 512
SIMD_NEON = 1 << 11, ///< NEON
}; };
// clang-format on // clang-format on
...@@ -96,6 +97,7 @@ private: ...@@ -96,6 +97,7 @@ private:
#define HAS_AVX HAS_SIMD(SIMD_AVX) #define HAS_AVX HAS_SIMD(SIMD_AVX)
#define HAS_AVX2 HAS_SIMD(SIMD_AVX2) #define HAS_AVX2 HAS_SIMD(SIMD_AVX2)
#define HAS_AVX512 HAS_SIMD(SIMD_AVX512) #define HAS_AVX512 HAS_SIMD(SIMD_AVX512)
#define HAS_NEON HAS_SIMD(SIMD_NEON)
// clang-format on // clang-format on
/** /**
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
*/ */
#include "Logging.h" #include "Logging.h"
#include <cstdlib>
namespace paddle { namespace paddle {
......
...@@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) { ...@@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) {
return v; return v;
} }
/**
 * Convert a value of type T to string, reporting status.
 *
 * @param [in] v input value of type T.
 * @param [out] ok status; set to true if the conversion succeeds. Pass
 *              nullptr if the caller does not care about errors.
 * @return the result of the conversion. If an error occurred, an empty
 *         string is returned.
*/
template <class T>
inline std::string toWithStatus(const T v, bool* ok = nullptr) {
std::ostringstream sout;
sout << v;
if (ok) {
*ok = !sout.fail();
}
return sout.str();
}
/// Convert string to type T. It makes sure all the characters in s are used. /// Convert string to type T. It makes sure all the characters in s are used.
/// Otherwise it will abort. /// Otherwise it will abort.
/// ///
...@@ -67,6 +86,18 @@ inline T to(const std::string& s) { ...@@ -67,6 +86,18 @@ inline T to(const std::string& s) {
return v; return v;
} }
/// Convert type T to string.
///
/// @tparam T type of input value
/// @param v input value of type T
template <class T>
std::string to_string(T v) {
bool ok;
std::string s = toWithStatus<T>(v, &ok);
CHECK(ok) << "Cannot convert v(" << v << ") to type std::string";
return s;
}
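A brief usage sketch of the helpers above; the header path matches the includes added elsewhere in this change:

```cpp
#include <string>
#include "paddle/utils/StringUtil.h"

// paddle::str::to_string is the drop-in replacement used at the former
// std::to_string call sites (e.g. Storage.cpp, ParameterServer2.cpp).
int main() {
  bool ok = false;
  // Status-reporting form: ok is set to true iff the stream insertion works.
  std::string a = paddle::str::toWithStatus(3.5, &ok);
  // Checked form: CHECK-fails (aborts) if the conversion cannot be done.
  std::string b = paddle::str::to_string(42);
  return (ok && a == "3.5" && b == "42") ? 0 : 1;
}
```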
} // namespace str } // namespace str
#undef DEFINE_STRING_CONVERSION #undef DEFINE_STRING_CONVERSION
......
...@@ -15,11 +15,16 @@ limitations under the License. */ ...@@ -15,11 +15,16 @@ limitations under the License. */
#include "Util.h" #include "Util.h"
#include <dirent.h> #include <dirent.h>
#include <pmmintrin.h>
#include <signal.h> #include <signal.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#ifdef __SSE__
#include <xmmintrin.h> #include <xmmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#include <fstream> #include <fstream>
#include <mutex> #include <mutex>
...@@ -163,8 +168,12 @@ void initMain(int argc, char** argv) { ...@@ -163,8 +168,12 @@ void initMain(int argc, char** argv) {
installProfilerSwitch(); installProfilerSwitch();
#ifdef __SSE__
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif
#ifdef __SSE3__
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
if (FLAGS_seed == 0) { if (FLAGS_seed == 0) {
unsigned int t = time(NULL); unsigned int t = time(NULL);
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/utils/Locks.h" #include "paddle/utils/Locks.h"
#include <semaphore.h> #include <semaphore.h>
#include <unistd.h> #include <unistd.h>
#include "paddle/utils/Logging.h"
namespace paddle { namespace paddle {
class SemaphorePrivate { class SemaphorePrivate {
...@@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { ...@@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
sem_init(&m->sem, 0, initValue); sem_init(&m->sem, 0, initValue);
} }
Semaphore::~Semaphore() { sem_destroy(&m->sem); } Semaphore::~Semaphore() {
sem_destroy(&m->sem);
delete m;
}
bool Semaphore::timeWait(struct timespec* ts) { bool Semaphore::timeWait(struct timespec* ts) {
return (0 == sem_timedwait(&m->sem, ts)); return (0 == sem_timedwait(&m->sem, ts));
...@@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); } ...@@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); }
void Semaphore::post() { sem_post(&m->sem); } void Semaphore::post() { sem_post(&m->sem); }
#ifdef PADDLE_USE_PTHREAD_SPINLOCK
class SpinLockPrivate { class SpinLockPrivate {
public: public:
inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
inline void lock() { pthread_spin_lock(&lock_); }
inline void unlock() { pthread_spin_unlock(&lock_); }
pthread_spinlock_t lock_; pthread_spinlock_t lock_;
char padding_[64 - sizeof(pthread_spinlock_t)]; char padding_[64 - sizeof(pthread_spinlock_t)];
}; };
SpinLock::SpinLock() : m(new SpinLockPrivate()) {} #else
SpinLock::~SpinLock() { delete m; } #include <atomic>
class SpinLockPrivate {
public:
inline void lock() {
while (lock_.test_and_set(std::memory_order_acquire)) {
}
}
inline void unlock() { lock_.clear(std::memory_order_release); }
std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
char padding_[64 - sizeof(lock_)]; // Padding to cache line size
};
void SpinLock::lock() { pthread_spin_lock(&m->lock_); } #endif
void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); } SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
SpinLock::~SpinLock() { delete m; }
void SpinLock::lock() { m->lock(); }
void SpinLock::unlock() { m->unlock(); }
#ifdef PADDLE_USE_PTHREAD_BARRIER
class ThreadBarrierPrivate { class ThreadBarrierPrivate {
public: public:
pthread_barrier_t barrier_; pthread_barrier_t barrier_;
inline explicit ThreadBarrierPrivate(int count) {
pthread_barrier_init(&barrier_, nullptr, count);
}
inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); }
inline void wait() { pthread_barrier_wait(&barrier_); }
}; };
ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) { #else
pthread_barrier_init(&m->barrier_, nullptr, count);
}
ThreadBarrier::~ThreadBarrier() { class ThreadBarrierPrivate {
pthread_barrier_destroy(&m->barrier_); public:
delete m; pthread_mutex_t mutex_;
} pthread_cond_t cond_;
int count_;
int tripCount_;
inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
CHECK_NE(cnt, 0);
// pthread init functions return 0 on success and a positive error code
// on failure, so CHECK_EQ is the meaningful assertion here.
CHECK_EQ(pthread_mutex_init(&mutex_, 0), 0);
CHECK_EQ(pthread_cond_init(&cond_, 0), 0);
}
inline ~ThreadBarrierPrivate() {
pthread_cond_destroy(&cond_);
pthread_mutex_destroy(&mutex_);
}
/**
* @brief wait
* @return true if the last wait
*/
inline bool wait() {
pthread_mutex_lock(&mutex_);
++count_;
if (count_ >= tripCount_) {
count_ = 0;
pthread_cond_broadcast(&cond_);
pthread_mutex_unlock(&mutex_);
return true;
} else {
pthread_cond_wait(&cond_, &mutex_);
pthread_mutex_unlock(&mutex_);
return false;
}
}
};
#endif
void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); } ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
ThreadBarrier::~ThreadBarrier() { delete m; }
void ThreadBarrier::wait() { m->wait(); }
} // namespace paddle } // namespace paddle
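Either way, the public interface is unchanged, so callers cannot tell whether the pthread primitives or the portable fallbacks were compiled in. A small usage sketch against the interface exercised above:

```cpp
#include <thread>
#include <vector>
#include "paddle/utils/Locks.h"

// Works identically with pthread_spinlock_t / pthread_barrier_t or with
// the std::atomic_flag and condition-variable fallbacks above.
int main() {
  const int kThreads = 4;
  paddle::SpinLock lock;
  paddle::ThreadBarrier barrier(kThreads);
  int counter = 0;

  std::vector<std::thread> workers;
  for (int i = 0; i < kThreads; ++i) {
    workers.emplace_back([&] {
      lock.lock();  // guard the shared counter with the spin lock
      ++counter;
      lock.unlock();
      barrier.wait();  // all threads rendezvous here before exiting
    });
  }
  for (auto& w : workers) w.join();
  return counter == kThreads ? 0 : 1;
}
```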
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/utils/CustomStackTrace.h" #include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/Locks.h" #include "paddle/utils/Locks.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
DEFINE_int32(test_thread_num, 10, "testing thread number"); DEFINE_int32(test_thread_num, 10, "testing thread number");
...@@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) { ...@@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) {
while (countDown-- > 0) { while (countDown-- > 0) {
start.wait(); start.wait();
for (size_t i = 0; i < layerSize; ++i) { for (size_t i = 0; i < layerSize; ++i) {
tracer.push("layer_" + std::to_string(i)); tracer.push("layer_" + paddle::str::to_string(i));
} }
tracer.pop(""); tracer.pop("");
for (size_t i = 0; i < layerSize; ++i) { for (size_t i = 0; i < layerSize; ++i) {
tracer.pop("layer_" + std::to_string(layerSize - 1 - i)); tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
} }
finish.wait(); finish.wait();
} }
...@@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) { ...@@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) {
while (countDown-- > 0) { while (countDown-- > 0) {
start.wait(); start.wait();
for (size_t i = 0; i < layerSize; ++i) { for (size_t i = 0; i < layerSize; ++i) {
tracer.push("layer_" + std::to_string(i)); tracer.push("layer_" + paddle::str::to_string(i));
} }
tracer.clear(); // in forward test, tracer will clear after forward. tracer.clear(); // in forward test, tracer will clear after forward.
finish.wait(); finish.wait();
......
...@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/utils/CustomStackTrace.h" #include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
int main(int argc, char** argv) { int main(int argc, char** argv) {
paddle::initMain(argc, argv); paddle::initMain(argc, argv);
for (size_t i = 0; i < 1000; ++i) { for (size_t i = 0; i < 1000; ++i) {
paddle::gLayerStackTrace.push("layer_" + std::to_string(i)); paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i));
if (i == 998) { if (i == 998) {
throw "Unhandle exception"; throw "Unhandle exception";
} }
......
...@@ -18,7 +18,8 @@ limitations under the License. */ ...@@ -18,7 +18,8 @@ limitations under the License. */
using namespace paddle; // NOLINT using namespace paddle; // NOLINT
TEST(SIMDFlags, gccTest) { TEST(SIMDFlags, gccTest) {
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) #if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
!defined(__arm__)
// clang-format off // clang-format off
CHECK(!__builtin_cpu_supports("sse") != HAS_SSE); CHECK(!__builtin_cpu_supports("sse") != HAS_SSE);
CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2); CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2);
...@@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) { ...@@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) {
LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX; LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX;
LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2; LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2;
LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512; LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512;
LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON;
} }
...@@ -441,6 +441,11 @@ message LayerConfig { ...@@ -441,6 +441,11 @@ message LayerConfig {
// blank label used in ctc loss // blank label used in ctc loss
optional uint32 blank = 52 [default = 0]; optional uint32 blank = 52 [default = 0];
// Stride parameter for the seqlastins layer, AverageLayer, and MaxLayer; it
// controls the scope of the pooling operation and can be set > 0.
// Leave it unset or set it to -1 to disable stride pooling.
optional int32 seq_pool_stride = 53 [default = -1];
} }
message EvaluatorConfig { message EvaluatorConfig {
......
...@@ -24,8 +24,9 @@ add_custom_target(paddle_python ALL DEPENDS ...@@ -24,8 +24,9 @@ add_custom_target(paddle_python ALL DEPENDS
${OUTPUT_DIR}/.timestamp) ${OUTPUT_DIR}/.timestamp)
add_subdirectory(paddle/trainer_config_helpers/tests) add_subdirectory(paddle/trainer_config_helpers/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests)
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/ install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
DESTINATION opt/paddle/share/wheels DESTINATION opt/paddle/share/wheels
......
...@@ -2485,6 +2485,7 @@ class SequenceLastInstanceLayer(LayerBase): ...@@ -2485,6 +2485,7 @@ class SequenceLastInstanceLayer(LayerBase):
active_type='linear', active_type='linear',
trans_type='non-seq', trans_type='non-seq',
bias=False, bias=False,
stride=-1,
**xargs): **xargs):
super(SequenceLastInstanceLayer, self).__init__( super(SequenceLastInstanceLayer, self).__init__(
name, name,
...@@ -2495,10 +2496,11 @@ class SequenceLastInstanceLayer(LayerBase): ...@@ -2495,10 +2496,11 @@ class SequenceLastInstanceLayer(LayerBase):
**xargs) **xargs)
config_assert( config_assert(
len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input') len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
if trans_type == 'seq':
config_assert(stride == -1, 'subseq does not support stride window')
self.config.trans_type = trans_type self.config.trans_type = trans_type
for input_index in xrange(len(self.inputs)): self.config.seq_pool_stride = stride
input_layer = self.get_input_layer(input_index) self.set_layer_size(self.get_input_layer(0).size)
self.set_layer_size(input_layer.size)
self.create_bias_parameter(bias, self.config.size) self.create_bias_parameter(bias, self.config.size)
...@@ -2510,10 +2512,16 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer): ...@@ -2510,10 +2512,16 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
active_type='linear', active_type='linear',
trans_type='non-seq', trans_type='non-seq',
bias=False, bias=False,
stride=-1,
**xargs): **xargs):
super(SequenceFirstInstanceLayer, self).__init__( super(SequenceFirstInstanceLayer, self).__init__(
name, inputs=inputs, active_type=active_type, bias=bias, **xargs) name,
self.config.trans_type = trans_type inputs=inputs,
active_type=active_type,
trans_type=trans_type,
bias=bias,
stride=stride,
**xargs)
self.config.select_first = True self.config.select_first = True
......
...@@ -18,7 +18,7 @@ import inspect ...@@ -18,7 +18,7 @@ import inspect
from paddle.trainer.config_parser import * from paddle.trainer.config_parser import *
from .activations import LinearActivation, SigmoidActivation, TanhActivation, \ from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
ReluActivation, IdentityActivation, SoftmaxActivation ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
from .evaluators import * from .evaluators import *
from .poolings import MaxPooling, AvgPooling, BasePoolingType from .poolings import MaxPooling, AvgPooling, BasePoolingType
from .attrs import * from .attrs import *
...@@ -1342,10 +1342,16 @@ def grumemory(input, ...@@ -1342,10 +1342,16 @@ def grumemory(input,
def last_seq(input, def last_seq(input,
name=None, name=None,
agg_level=AggregateLevel.EACH_TIMESTEP, agg_level=AggregateLevel.EACH_TIMESTEP,
stride=-1,
layer_attr=None): layer_attr=None):
""" """
Get Last Timestamp Activation of a sequence. Get Last Timestamp Activation of a sequence.
If stride > 0, this layer slides a window whose size is determined by stride
and returns the last value of the window as the output. Thus, a long sequence
is shortened. Note that for sequences with sub-sequences, stride must keep
its default value of -1.
The simple usage is: The simple usage is:
.. code-block:: python .. code-block:: python
...@@ -1357,6 +1363,8 @@ def last_seq(input, ...@@ -1357,6 +1363,8 @@ def last_seq(input,
:type name: basestring :type name: basestring
:param input: Input layer name. :param input: Input layer name.
:type input: LayerOutput :type input: LayerOutput
:param stride: the window size; -1 (the default) disables stride pooling.
:type stride: int
:param layer_attr: extra layer attributes. :param layer_attr: extra layer attributes.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
...@@ -1368,11 +1376,15 @@ def last_seq(input, ...@@ -1368,11 +1376,15 @@ def last_seq(input,
" series information at all. Maybe you want to use" " series information at all. Maybe you want to use"
" first_seq instead.") " first_seq instead.")
if agg_level == AggregateLevel.EACH_SEQUENCE:
assert stride == -1
Layer( Layer(
name=name, name=name,
type=LayerType.SEQUENCE_LAST_INSTANCE, type=LayerType.SEQUENCE_LAST_INSTANCE,
inputs=[input.name], inputs=[input.name],
trans_type=agg_level, trans_type=agg_level,
stride=stride,
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput( return LayerOutput(
name, name,
...@@ -1386,10 +1398,16 @@ def last_seq(input, ...@@ -1386,10 +1398,16 @@ def last_seq(input,
def first_seq(input, def first_seq(input,
name=None, name=None,
agg_level=AggregateLevel.EACH_TIMESTEP, agg_level=AggregateLevel.EACH_TIMESTEP,
stride=-1,
layer_attr=None): layer_attr=None):
""" """
Get First Timestamp Activation of a sequence. Get First Timestamp Activation of a sequence.
If stride > 0, this layer slides a window whose size is determined by stride
and returns the first value of the window as the output. Thus, a long sequence
is shortened. Note that for sequences with sub-sequences, stride must keep
its default value of -1.
The simple usage is: The simple usage is:
.. code-block:: python .. code-block:: python
...@@ -1401,6 +1419,8 @@ def first_seq(input, ...@@ -1401,6 +1419,8 @@ def first_seq(input,
:type name: basestring :type name: basestring
:param input: Input layer name. :param input: Input layer name.
:type input: LayerOutput :type input: LayerOutput
:param stride: the window size; -1 (the default) disables stride pooling.
:type stride: int
:param layer_attr: extra layer attributes. :param layer_attr: extra layer attributes.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
...@@ -1413,11 +1433,15 @@ def first_seq(input, ...@@ -1413,11 +1433,15 @@ def first_seq(input,
' time series information at all. Maybe you want to use' ' time series information at all. Maybe you want to use'
' last_seq instead.') ' last_seq instead.')
if agg_level == AggregateLevel.EACH_SEQUENCE:
assert stride == -1
Layer( Layer(
name=name, name=name,
type=LayerType.SEQUENCE_FIRST_INSTANCE, type=LayerType.SEQUENCE_FIRST_INSTANCE,
inputs=[input.name], inputs=[input.name],
trans_type=agg_level, trans_type=agg_level,
stride=stride,
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput( return LayerOutput(
name, name,
...@@ -1916,7 +1940,7 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None): ...@@ -1916,7 +1940,7 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
@layer_support() @layer_support()
def hsigmoid(input, def hsigmoid(input,
label, label,
num_classes, num_classes=None,
name=None, name=None,
bias_attr=None, bias_attr=None,
param_attr=None, param_attr=None,
...@@ -1932,8 +1956,7 @@ def hsigmoid(input, ...@@ -1932,8 +1956,7 @@ def hsigmoid(input,
.. code-block:: python .. code-block:: python
cost = hsigmoid(input=[layer1, layer2], cost = hsigmoid(input=[layer1, layer2],
label=data_layer, label=data_layer)
num_classes=3)
:param input: Input layers. It could be a LayerOutput or list/tuple of :param input: Input layers. It could be a LayerOutput or list/tuple of
LayerOutput. LayerOutput.
...@@ -1941,12 +1964,14 @@ def hsigmoid(input, ...@@ -1941,12 +1964,14 @@ def hsigmoid(input,
:param label: Label layer. :param label: Label layer.
:type label: LayerOutput :type label: LayerOutput
:param num_classes: number of classes. :param num_classes: number of classes.
:type num_classes: int :type num_classes: int|None
:param name: layer name :param name: layer name
:type name: basestring :type name: basestring
:param bias_attr: Bias attribute. None means default bias. :param bias_attr: Bias attribute. None means default bias.
False means no bias. False means no bias.
:type bias_attr: ParameterAttribute|False :type bias_attr: ParameterAttribute|False
:param param_attr: Parameter Attribute. None means default parameter.
:type param_attr: ParameterAttribute|None
:param layer_attr: Extra Layer Attribute. :param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
...@@ -1966,6 +1991,11 @@ def hsigmoid(input, ...@@ -1966,6 +1991,11 @@ def hsigmoid(input,
assert isinstance(label, LayerOutput) assert isinstance(label, LayerOutput)
assert label.layer_type == LayerType.DATA assert label.layer_type == LayerType.DATA
if num_classes is None:
num_classes = label.size
if num_classes is None or num_classes <= 2:
raise ValueError("hsigmoid label size must be larger than 2.")
ipts_for_layer = [] ipts_for_layer = []
parents = [] parents = []
for each_input, each_param_attr in zip(input, param_attr): for each_input, each_param_attr in zip(input, param_attr):
...@@ -2253,8 +2283,9 @@ def img_pool_layer(input, ...@@ -2253,8 +2283,9 @@ def img_pool_layer(input,
pool_type.name = 'avg' pool_type.name = 'avg'
type_name = pool_type.name + '-projection' \ type_name = pool_type.name + '-projection' \
if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \ if (
else pool_type.name isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
else pool_type.name
pool_size_y = pool_size if pool_size_y is None else pool_size_y pool_size_y = pool_size if pool_size_y is None else pool_size_y
stride_y = stride if stride_y is None else stride_y stride_y = stride if stride_y is None else stride_y
...@@ -3294,8 +3325,8 @@ def recurrent_group(step,
    assert (targetInlink == None or targetInlink_in_inlinks())
    targetInlinkName = None if targetInlink == None \
        else targetInlink.name if isinstance(targetInlink, LayerOutput) \
        else targetInlink.input.name

    contains_sub_seq = [False]
...@@ -4807,12 +4838,14 @@ def crf_decoding_layer(input,
    return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)


@wrap_act_default(act=SigmoidActivation())
@wrap_bias_attr_default(has_bias=True)
@wrap_name_default()
@layer_support()
def nce_layer(input,
              label,
              num_classes,
              act=None,
              weight=None,
              num_neg_samples=10,
              neg_distribution=None,
...@@ -4841,6 +4874,8 @@ def nce_layer(input,
    :type weight: LayerOutput
    :param num_classes: number of classes.
    :type num_classes: int
    :param act: Activation of this layer. SigmoidActivation is the default.
    :type act: BaseActivation
    :param num_neg_samples: number of negative samples. Default is 10.
    :type num_neg_samples: int
    :param neg_distribution: The distribution for generating the random negative labels.
...@@ -4862,7 +4897,9 @@ def nce_layer(input,
    if neg_distribution is not None:
        assert isinstance(neg_distribution, collections.Sequence)
        assert len(neg_distribution) == num_classes
        assert abs(sum(neg_distribution) - 1.0) < 1e-5
    if not isinstance(act, BaseActivation):
        raise TypeError("act of nce_layer must be a BaseActivation instance.")

    ipts_for_layer = []
    parents = []
...@@ -4884,12 +4921,17 @@ def nce_layer(input,
        type=LayerType.NCE_LAYER,
        num_classes=num_classes,
        neg_sampling_dist=neg_distribution,
        active_type=act.name,
        num_neg_samples=num_neg_samples,
        inputs=ipts_for_layer,
        bias=ParamAttr.to_bias(bias_attr),
        **ExtraLayerAttribute.to_kwargs(layer_attr))
    return LayerOutput(
        name,
        LayerType.NCE_LAYER,
        parents=parents,
        size=l.config.size,
        activation=act)
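A hedged usage sketch of the new act parameter (layer names are illustrative; assumes the usual trainer_config_helpers imports). wrap_act_default substitutes SigmoidActivation when act is None, and the tolerance check above accepts any neg_distribution whose sum is within 1e-5 of 1.0:

embedding = fc_layer(input=data_layer(name='word', size=1024), size=128)
next_word = data_layer(name='next_word', size=5000)
cost = nce_layer(input=embedding,
                 label=next_word,
                 num_classes=5000,
                 act=SigmoidActivation(),  # explicit here; same as the default
                 num_neg_samples=25)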
""" """
......
...@@ -14,4 +14,7 @@ for op in seq_op:
    for al in agg_level:
        opts.append(op(input=din, agg_level=al))

for op in seq_op:
    opts.append(op(input=din, agg_level=AggregateLevel.EACH_TIMESTEP, stride=5))

outputs(opts)
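Assuming the windowed-pooling semantics this change introduces, stride=5 splits each sequence into consecutive windows of 5 timesteps and emits one instance per window, so first_seq/last_seq return a shorter sequence rather than a single step; the generated configs below record this as seq_pool_stride: 5. A pure-Python sketch of the assumed windowing (not Paddle code):

def last_seq_with_stride(seq, stride):
    # Take the last element of every consecutive window of `stride` steps.
    return [seq[min(i + stride, len(seq)) - 1]
            for i in range(0, len(seq), stride)]

assert last_seq_with_stride(list(range(12)), 5) == [4, 9, 11]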
...@@ -15,6 +15,7 @@ layers {
  }
  select_first: true
  trans_type: "seq"
  seq_pool_stride: -1
}
layers {
  name: "__first_seq_1__"
...@@ -26,6 +27,7 @@ layers {
  }
  select_first: true
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__last_seq_0__"
...@@ -36,6 +38,7 @@ layers {
    input_layer_name: "data"
  }
  trans_type: "seq"
  seq_pool_stride: -1
}
layers {
  name: "__last_seq_1__"
...@@ -46,12 +49,38 @@ layers {
    input_layer_name: "data"
  }
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__first_seq_2__"
  type: "seqlastins"
  size: 30
  active_type: "linear"
  inputs {
    input_layer_name: "data"
  }
  select_first: true
  trans_type: "non-seq"
  seq_pool_stride: 5
}
layers {
  name: "__last_seq_2__"
  type: "seqlastins"
  size: 30
  active_type: "linear"
  inputs {
    input_layer_name: "data"
  }
  trans_type: "non-seq"
  seq_pool_stride: 5
}
input_layer_names: "data"
output_layer_names: "__first_seq_0__"
output_layer_names: "__first_seq_1__"
output_layer_names: "__last_seq_0__"
output_layer_names: "__last_seq_1__"
output_layer_names: "__first_seq_2__"
output_layer_names: "__last_seq_2__"
sub_models {
  name: "root"
  layer_names: "data"
...@@ -59,11 +88,15 @@ sub_models {
  layer_names: "__first_seq_1__"
  layer_names: "__last_seq_0__"
  layer_names: "__last_seq_1__"
  layer_names: "__first_seq_2__"
  layer_names: "__last_seq_2__"
  input_layer_names: "data"
  output_layer_names: "__first_seq_0__"
  output_layer_names: "__first_seq_1__"
  output_layer_names: "__last_seq_0__"
  output_layer_names: "__last_seq_1__"
  output_layer_names: "__first_seq_2__"
  output_layer_names: "__last_seq_2__"
  is_recurrent_layer_group: false
}
...@@ -128,6 +128,7 @@ layers {
    input_layer_name: "__simple_gru_0__"
  }
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__last_seq_1__"
...@@ -138,6 +139,7 @@ layers {
    input_layer_name: "__simple_gru_1__"
  }
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__fc_layer_0__"
...
...@@ -210,6 +210,7 @@ layers {
    input_layer_name: "__lstm_group_0__"
  }
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__last_seq_1__"
...@@ -220,6 +221,7 @@ layers {
    input_layer_name: "__lstm_group_1__"
  }
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__fc_layer_0__"
...
...@@ -143,6 +143,7 @@ layers {
    input_layer_name: "__recurrent_layer_0__"
  }
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__first_seq_0__"
...@@ -154,6 +155,7 @@ layers {
  }
  select_first: true
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__last_seq_1__"
...@@ -164,6 +166,7 @@ layers {
    input_layer_name: "__lstmemory_0__"
  }
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__first_seq_1__"
...@@ -175,6 +178,7 @@ layers {
  }
  select_first: true
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__last_seq_2__"
...@@ -185,6 +189,7 @@ layers {
    input_layer_name: "__gru_0__"
  }
  trans_type: "non-seq"
  seq_pool_stride: -1
}
layers {
  name: "__first_seq_2__"
...@@ -196,6 +201,7 @@ layers {
  }
  select_first: true
  trans_type: "non-seq"
  seq_pool_stride: -1
}
parameters {
  name: "___fc_layer_0__.w0"
...