提交 0d0c7613 编写于 作者: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into clusterdesign

......@@ -7,6 +7,7 @@ build/
.project
.cproject
.pydevproject
.settings/
Makefile
.test_env/
third_party/
......
......@@ -12,19 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License
cmake_minimum_required(VERSION 3.0)
project(paddle CXX C)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
include(system)
if(ANDROID)
cmake_minimum_required(VERSION 3.7)
else()
cmake_minimum_required(VERSION 3.0)
endif()
project(paddle CXX C)
find_package(Sphinx)
find_package(CUDA QUIET)
if(NOT CMAKE_CROSSCOMPILING)
find_package(CUDA QUIET)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
include(system)
include(simd)
################################ Configurations #######################################
......@@ -40,7 +47,7 @@ option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(ON_COVERALLS "Compile PaddlePaddle with code coverage" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
......@@ -51,6 +58,21 @@ if(NOT CMAKE_BUILD_TYPE)
FORCE)
endif()
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
endif()
set(WITH_GPU OFF CACHE STRING
"Disable GPU when cross-compiling for Android" FORCE)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when cross-compiling for Android" FORCE)
set(WITH_PYTHON OFF CACHE STRING
"Disable PYTHON when cross-compiling for Android" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android" FORCE)
endif(ANDROID)
set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
########################################################################################
......@@ -64,6 +86,7 @@ include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
include(package) # set paddle packages
include(cpplint) # set paddle c++ style
......@@ -74,7 +97,6 @@ include(flags) # set paddle compile flags
include(cudnn) # set cudnn libraries
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(configure) # add paddle env configuration
include_directories("${PROJ_ROOT}")
......@@ -82,14 +104,21 @@ include_directories("${PROJ_ROOT}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
set(EXTERNAL_LIBS
# have not include gtest here.
${GFLAGS_LIBRARIES}
${GLOG_LIBRARIES}
${CBLAS_LIBRARIES}
${PROTOBUF_LIBRARY}
${ZLIB_LIBRARIES}
${PYTHON_LIBRARIES}
)
if(WITH_GPU)
list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(NOT WITH_DSO)
endif(WITH_GPU)
add_subdirectory(proto)
add_subdirectory(paddle)
add_subdirectory(python)
......
# A image for building paddle binaries
# Use cuda devel base image for both cpu and gpu environment
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
# ENV variables
ARG BUILD_WOBOQ
ARG WITH_GPU
ARG WITH_AVX
ARG WITH_DOC
ARG WITH_STYLE_CHECK
ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_AVX:-OFF}
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-OFF}
......@@ -37,18 +36,20 @@ RUN git config --global credential.helper store
# Fix locales to en_US.UTF-8
RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
# version util jupyter fixes this issue.
RUN pip install --upgrade pip && \
pip install -U 'protobuf==3.1.0' && \
pip install -U wheel pillow BeautifulSoup && \
pip install -U docopt PyYAML sphinx && \
pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
pip install -U pre-commit 'requests==2.9.2' jupyter
pip install pre-commit 'requests==2.9.2' 'ipykernel==4.6.0' 'jupyter==1.0.0'
RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
cd .. && rm -rf cmake-3.4.1
VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
VOLUME ["/woboq_out"]
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
RUN mkdir /var/run/sshd
......
......@@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF)
set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS
find_path(MKL_INC_DIR mkl.h PATHS
${MKL_ROOT}/include)
find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
......@@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${MKL_ROOT}/lib/intel64)
if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
set(CBLAS_PROVIDER MKL)
set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
set(CBLAS_INC_DIR ${MKL_INC_DIR})
set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
${MKL_SEQUENTIAL_LIB}
${MKL_CORE_LIB})
add_definitions(-DPADDLE_USE_MKL)
message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
set(CBLAS_FOUND ON)
if(${MKL_LAPACK_INC_DIR})
add_definitions(-DPADDLE_USE_LAPACK)
message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
endif()
return() # return file.
endif()
......@@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND)
set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
set(CBLAS_FOUND ON)
if(ATLAS_CLAPACK_INC_DIR)
add_definitions(-DPADDLE_USE_LAPACK)
message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
endif()
return()
endif()
......@@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
set(CBLAS_PROVIDER OPENBLAS)
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
set(CBLAS_FOUND ON)
if(OPENBLAS_LAPACKE_INC_DIR)
add_definitions(-DPADDLE_USE_LAPACK)
message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
endif()
return()
endif()
......
......@@ -32,6 +32,14 @@ if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
if(NOT CMAKE_CROSSCOMPILING)
if(WITH_AVX AND AVX_FOUND)
set(SIMD_FLAG ${AVX_FLAG})
elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
endif()
if(NOT WITH_GPU)
add_definitions(-DPADDLE_ONLY_CPU)
add_definitions(-DHPPL_STUB_FUNC)
......@@ -48,21 +56,12 @@ else()
message(FATAL_ERROR "Paddle need cudnn to compile")
endif()
if(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
else(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
endif(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
else(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
endif(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
......@@ -61,7 +61,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
endif()
endfunction()
if(ON_COVERALLS)
if(WITH_COVERAGE)
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
......
......@@ -134,7 +134,7 @@ foreach(GCDA ${GCDA_FILES})
# If -p is not specified then the file is named only "the_file.c.gcov"
#
execute_process(
COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}"
COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null
WORKING_DIRECTORY ${GCDA_DIR}
)
endforeach()
......
if(NOT WITH_GPU)
return()
endif()
set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
......@@ -11,6 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDNN_ROOT}
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/x86_64-linux-gnu
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
......
INCLUDE(ExternalProject)
SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
ExternalProject_Add(
linb_any
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/thelink2012/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020"
PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
......@@ -31,9 +31,17 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/gflags/gflags.git"
PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
LIST(APPEND external_project_dependencies gflags)
......@@ -33,11 +33,19 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/google/glog.git"
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DWITH_GFLAGS=ON
CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
LIST(APPEND external_project_dependencies glog)
......@@ -41,11 +41,19 @@ IF(WITH_TESTING)
GIT_TAG "release-1.8.0"
PREFIX ${GTEST_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_GMOCK=ON
CMAKE_ARGS -Dgtest_disable_pthreads=ON
CMAKE_ARGS -Dgtest_force_shared_crt=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
LIST(APPEND external_project_dependencies gtest)
ENDIF(WITH_TESTING)
......@@ -29,7 +29,24 @@ IF(NOT ${CBLAS_FOUND})
IF(CMAKE_COMPILER_IS_GNUCC)
ENABLE_LANGUAGE(Fortran)
LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
if (NOT CMAKE_Fortran_COMPILER_VERSION)
# cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly.
execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion
OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION)
endif()
string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION})
list(GET Fortran_VERSION 0 Fortran_MAJOR)
list(GET Fortran_VERSION 1 Fortran_MINOR)
find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS
/lib
/usr/lib
/usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/
/usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/)
if (NOT GFORTRAN_LIBRARY)
message(FATAL_ERROR "Cannot found gfortran library which it is used by openblas")
endif()
find_package(Threads REQUIRED)
LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
ENDIF(CMAKE_COMPILER_IS_GNUCC)
IF(NOT CMAKE_Fortran_COMPILER)
......@@ -37,6 +54,8 @@ IF(NOT ${CBLAS_FOUND})
"you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
ENDIF(NOT CMAKE_Fortran_COMPILER)
ADD_DEFINITIONS(-DPADDLE_USE_LAPACK)
ExternalProject_Add(
openblas
${EXTERNAL_PROJECT_LOG_ARGS}
......@@ -45,7 +64,7 @@ IF(NOT ${CBLAS_FOUND})
PREFIX ${CBLAS_SOURCES_DIR}
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 1
BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
......@@ -14,7 +14,8 @@
INCLUDE(ExternalProject)
FIND_PACKAGE(Protobuf 3.1)
set(PROTOBUF_VERSION 3.1)
FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
IF(PROTOBUF_FOUND)
EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
......@@ -57,12 +58,20 @@ IF(NOT PROTOBUF_FOUND)
GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
CONFIGURE_COMMAND
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
-Dprotobuf_BUILD_TESTS=OFF
-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=lib
-Dprotobuf_BUILD_TESTS=OFF
-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=lib
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DZLIB_ROOT:STRING=${ZLIB_ROOT}
)
LIST(APPEND external_project_dependencies protobuf)
......
......@@ -219,5 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
IF(WITH_PYTHON)
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
ELSE()
SET(PYTHON_LIBRARIES "")
ENDIF()
......@@ -50,12 +50,19 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
CMAKE_ARGS -DWITH_GPU=${WITH_GPU}
CMAKE_ARGS -DWITH_OMP=${USE_OMP}
CMAKE_ARGS -DWITH_TORCH=OFF
CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
CMAKE_ARGS -DBUILD_SHARED=ON
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
LIST(APPEND external_project_dependencies warpctc)
......@@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire
IF(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
ELSE(WIN32)
set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
......@@ -34,10 +34,18 @@ ExternalProject_Add(
GIT_TAG "v1.2.8"
PREFIX ${ZLIB_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
LIST(APPEND external_project_dependencies zlib)
......@@ -2,6 +2,7 @@
include(CheckCXXCompilerFlag)
include(CheckCCompilerFlag)
include(CheckCXXSymbolExists)
include(CheckTypeSize)
function(CheckCompilerCXX11Flag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
......@@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag)
endfunction()
CheckCompilerCXX11Flag()
LIST(APPEND CMAKE_CXX_FLAGS -std=c++11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# safe_set_flag
#
......@@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS)
endif()
endif()
SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)
CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)
if(SPINLOCK_FOUND)
add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
endif(SPINLOCK_FOUND)
if(BARRIER_FOUND)
add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
endif(BARRIER_FOUND)
SET(CMAKE_EXTRA_INCLUDE_FILES "")
# Common flags. the compiler flag used for C/C++ sources whenever release or debug
# Do not care if this flag is support for gcc.
set(COMMON_FLAGS
......
......@@ -2,6 +2,7 @@
# so that PaddlePaddle can unleash the vectorization power of muticore.
INCLUDE(CheckCXXSourceRuns)
INCLUDE(CheckCXXSourceCompiles)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(MMX_FLAG "-mmmx")
......@@ -17,6 +18,8 @@ ELSEIF(MSVC)
SET(AVX2_FLAG "/arch:AVX2")
ENDIF()
set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
CHECK_CXX_SOURCE_RUNS("
......@@ -73,4 +76,5 @@ int main()
return 0;
}" AVX2_FOUND)
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
......@@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
IF(DEFINED CMAKE_SYSTEM_NAME)
IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android")
SET(ANDROID TRUE)
ENDIF()
ENDIF()
# external dependencies log output
SET(EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD 0 # Wrap download in script to log output
......
......@@ -90,25 +90,9 @@ function(link_paddle_exe TARGET_NAME)
${RDMA_LD_FLAGS}
${RDMA_LIBS})
if(WITH_PYTHON)
target_link_libraries(${TARGET_NAME}
${PYTHON_LIBRARIES} util)
endif()
if(WITH_GPU)
target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY})
if(NOT WITH_DSO OR WITH_METRIC)
target_link_libraries(${TARGET_NAME}
${CUDNN_LIBRARY}
${CUDA_curand_LIBRARY})
CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME})
endif()
check_library_exists(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME )
if(HAVE_CLOCK_GETTIME)
target_link_libraries(${TARGET_NAME} rt)
endif()
endif()
if(ANDROID)
target_link_libraries(${TARGET_NAME} log)
endif(ANDROID)
add_dependencies(${TARGET_NAME} ${external_project_dependencies})
endfunction()
......
import sys
import paddle.v2 as paddle
def seqToseq_net(source_dict_dim, target_dict_dim):
def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
beam_size = 3
max_length = 250
#### Encoder
src_word_id = paddle.layer.data(
name='source_language_word',
......@@ -67,79 +71,143 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
if not is_generating:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
else:
# In generation, the decoder predicts a next target word based on
# the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by
# StaticInput, which is a read-only memory.
# Embedding of the last generated word is automatically gotten by
# GeneratedInputs, which is initialized by a start mark, such as <s>,
# and must be included in generation.
trg_embedding = paddle.layer.GeneratedInputV2(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
def main():
paddle.init(use_gpu=False, trainer_count=1)
is_generating = False
# source and target dict dim.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
# define network topology
cost = seqToseq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=1e-3))
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
# define data reader
feeding = {
'source_language_word': 0,
'target_language_word': 1,
'target_language_next_word': 2
}
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
batch_size=5)
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
# train the network
if not is_generating:
cost = seqToseq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
# define data reader
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=8192),
batch_size=5)
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost,
event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
# start to train
trainer.train(
reader=wmt14_reader, event_handler=event_handler, num_passes=2)
# generate a english sequence to french
else:
# use the first 3 samples for generation
gen_creator = paddle.dataset.wmt14.gen(dict_size)
gen_data = []
gen_num = 3
for item in gen_creator():
gen_data.append((item[0], ))
if len(gen_data) == gen_num:
break
beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
# get the pretrained model, whose bleu = 26.92
parameters = paddle.dataset.wmt14.model()
# prob is the prediction probabilities, and id is the prediction word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=gen_data,
field=['prob', 'id'])
# get the dictionary
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
# the delimited element of generated sequences is -1,
# the first element of each generated sequence is the sequence length
seq_list = []
seq = []
for w in beam_result[1]:
if w != -1:
seq.append(w)
else:
sys.stdout.write('.')
sys.stdout.flush()
# start to train
trainer.train(
reader=wmt14_reader,
event_handler=event_handler,
num_passes=10000,
feeding=feeding)
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
beam_size = 3
for i in xrange(gen_num):
print "\n*******************************************************\n"
print "src:", ' '.join(
[src_dict.get(w) for w in gen_data[i][0]]), "\n"
for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
if __name__ == '__main__':
......
......@@ -109,6 +109,12 @@ sum_to_one_norm
:members: sum_to_one_norm
:noindex:
cross_channel_norm
------------------
.. automodule:: paddle.v2.layer
:members: cross_channel_norm
:noindex:
Recurrent Layers
================
......
.. _api_v2.optimizer:
==========
Optimizer
==========
......
========
Datasets
========
==================================
Data Reader Interface and DataSets
==================================
DataTypes
......@@ -49,7 +49,6 @@ mnist
:members:
:noindex:
cifar
+++++
......@@ -61,7 +60,7 @@ conll05
+++++++
.. automodule:: paddle.v2.dataset.conll05
:members:
:members: get_dict,get_embedding,test
:noindex:
imdb
......@@ -85,6 +84,12 @@ movielens
:members:
:noindex:
.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
:noindex:
.. autoclass:: paddle.v2.dataset.movielens.UserInfo
:noindex:
sentiment
+++++++++
......@@ -102,7 +107,7 @@ uci_housing
wmt14
+++++
.. automodule:: paddle.v2.dataset.uci_housing
.. automodule:: paddle.v2.dataset.wmt14
:members:
:noindex:
......@@ -6,18 +6,21 @@ Parameters
==========
.. automodule:: paddle.v2.parameters
:members: Parameters
:noindex:
Trainer
=======
.. automodule:: paddle.v2.trainer
:members: SGD
:noindex:
Event
=====
.. automodule:: paddle.v2.event
:members:
:noindex:
Inference
......@@ -25,3 +28,4 @@ Inference
.. autofunction:: paddle.v2.infer
:noindex:
\ No newline at end of file
# Design Doc: Distributed Training
## Objective
In [this slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we'd like PaddlePaddle running on general-purpose clusters like those managed by Kubernetes, so to address demands for AI from both Internet and non-Internet industries.
This poses technical challenges to PaddlePaddle:
1. Support fault-recovery.
1. Support both offline and online training.
1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training.
## Training Job
A training job will be created once user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes:
1. the *master process*, which dispatches tasks to
1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
1. one or more *parameter server processes*, where each holds a shard of the global model.
Their relation is illustrated in the following graph:
<img src="src/paddle-model-sharding.png"/>
### Master Process
The master process will:
- Partition a dataset into [tasks](#task) and dispatch tasks to trainers.
- Keep track of training progress on the dataset with [task queue](#task-queue). A training job will iterate on the dataset for a full pass until it goes into next pass.
#### Task
A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
#### Task Queue
The master process has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master process. Each master process has three task queues.
<img src="src/paddle-task-queues.png"/>
- The todo queue holds tasks to be dispatched. When a job starts, the master process fills in the todo queue with all tasks.
- The pending queue holds tasks that are currently training by trainers.
- the done queue holds tasks that are already trained.
The life cycle of a single task is illustrated below:
<img src="src/paddle-task-states.png"/>
1. When a new pass of training starts, all tasks will be placed in the todo queue.
1. The master process will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion.
1. The trainer will work on its tasks and tell the master process once a task is completed. The master process will dispatch a new task to that trainer.
1. If a task timeout. the master process will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
1. The master process will move completed task to the done queue. When the todo queue is empty, the master process will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero.
### Trainer Process
The trainer process will:
- Receive tasks from the master.
- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
### Parameter Server Process
Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers.
The parameter server will:
- Receive gradient from the trainers, update its parameters, and give the trainers the latest parameters.
- Periodically save its parameters to distributed file system by overriding the previous save.
### Optimization Algorithms
The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm:
- Synchronous Stochastic Gradient Descent (sync-SGD)
Parameter server will wait for all trainer finish n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting n+1-th mini-batch.
- Asynchronous Stochastic Gradient Descent (async-SGD)
There will no synchronization between different trainers, and parameter server updates its parameter as soon as it receives new gradient:
- Each trainer uploads its accumulated gradient every n mini-batches.
- Every m mini-batches, the trainer downloads new parameters from parameter server.
- n and m do not have to be equal.
## Fault Tolerant
The training job will pause if the master processes is dead, or any of the parameter server process is dead. They will be started by [Kubernetes](https://kubernetes.io/) and recover in few minutes. Please refer to [fault recovery](#fault-recovery).
The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm:
- sync-SGD
TODO
- async-SGD
Since async-SGD does not require synchronization between mini-batches, the system will by definition make process if at least one trainer is running.
## Fault Recovery
PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into distributed file system, so a restarted parameter server can recover its parameters from the saved file.
Now we will introduce how each process recovers from a failure, the graph below shows how etcd is used:
<img src="src/paddle-etcd.png"/>
### Master Process
When the master is started by the Kubernetes, it executes the following steps at startup:
1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
The master process will kill itself if its etcd lease expires.
When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
### Trainer Process
When the trainer is started by the Kubernetes, it executes the following steps at startup:
1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
1. Waits for tasks from the master to start training.
If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master process can discover the trainer again.
### Parameter Server Process
When the parameter server is started by Kubernetes, it executes the following steps at startup:
1. Read desired total number of parameter servers from etcd `/ps_desired`
1. Search through etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existant key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name.
The desired number of parameter servers is 3:
<img src="src/paddle-ps-0.png"/>
The third parameter server joined:
<img src="src/paddle-ps-1.png"/>
1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
1. Now the parameter server is ready for the trainers' requests.
If the parameter server's etcd lease expires, the parameter server will kill itself.
## Dynamic Scaling
### Trainer Scaling
TODO
### Parameter Server Scaling
Not planned for v1.
## Training Dataset Format
TODO
## User Interface
TODO
......@@ -51,7 +51,7 @@ PaddlePaddle supports some build options.
<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
<tr><td class="left">ON_COVERALLS</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
</tbody>
......
......@@ -4,119 +4,139 @@ PaddlePaddle的Docker容器使用方式
PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
纯CPU和GPU的docker镜像使用说明
PaddlePaddle发布的docker镜像使用说明
------------------------------
对于每一个PaddlePaddle版本,我们都会发布两个Docker镜像:纯CPU的和GPU的
我们通过设置 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 自动生成最新的docker镜像:
`paddledev/paddle:0.10.0rc1-cpu` 和 `paddledev/paddle:0.10.0rc1-gpu`。
对于每一个PaddlePaddle版本,我们都会发布两种Docker镜像:开发镜像、运行镜像。运行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本
我们会在 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 提供最新的docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。
1. 开发镜像::code:`paddlepaddle/paddle:<version>-dev`
以交互容器方式运行纯CPU的镜像:
这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布,
文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。
开发镜像包含了以下工具:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作,
也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发:
.. code-block:: bash
以交互容器方式运行开发镜像:
docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
.. code-block:: bash
或者,可以以后台进程方式运行容器:
docker run -it --rm paddledev/paddle:<version>-dev /bin/bash
.. code-block:: bash
或者,可以以后台进程方式运行容器:
docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
.. code-block:: bash
然后用密码 :code:`root` SSH进入容器:
docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
.. code-block:: bash
然后用密码 :code:`root` SSH进入容器:
ssh -p 2202 root@localhost
.. code-block:: bash
SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。
ssh -p 2202 root@localhost
SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。
以上方法在GPU镜像里也能用-只是请不要忘记按装CUDA驱动,以及告诉Docker:
2. 运行镜像:根据CPU、GPU和非AVX区分了如下4个镜像:
- GPU/AVX::code:`paddlepaddle/paddle:<version>-gpu`
- GPU/no-AVX::code:`paddlepaddle/paddle:<version>-gpu-noavx`
- CPU/AVX::code:`paddlepaddle/paddle:<version>`
- CPU/no-AVX::code:`paddlepaddle/paddle:<version>-noavx`
.. code-block:: bash
纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX:
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
.. code-block:: bash
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
运行PaddlePaddle书籍
---------------------
如果输出是No,就需要选择使用no-AVX的镜像
Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。
为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nodebook。
如果您想要更深入了解deep learning,PaddlePaddle书籍一定是您最好的选择。
.. code-block:: bash
当您进入容器内之后,只用运行以下命令:
nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
.. code-block:: bash
jupyter notebook
注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。:
然后在浏览器中输入以下网址:
.. code-block:: text
.. code-block:: bash
http://localhost:8888/
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
就这么简单,享受您的旅程!
3. 使用运行镜像发布你的AI程序
假设您已经完成了一个AI训练的python程序 :code:`a.py`,这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行:
.. code-block:: bash
非AVX镜像
---------
docker run -it -v $PWD:/work paddle /work/a.py
纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX:
这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:<version>`
创建和发布自己的AI程序镜像。
.. code-block:: bash
运行PaddlePaddle书籍
---------------------
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
如果输出是No,我们就需要手动编译一个非AVX版本的镜像:
PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nodebook。
如果您想要更深入了解deep learning,PaddlePaddle书籍一定是您最好的选择。
我们提供可以直接运行PaddlePaddle书籍的docker镜像,直接运行:
.. code-block:: bash
cd ~
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
docker run -p 8888:8888 paddlepaddle/book
然后在浏览器中输入以下网址:
.. code-block:: text
http://localhost:8888/
就这么简单,享受您的旅程!
通过Docker容器开发PaddlePaddle
------------------------------
开发人员可以在Docker中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。
开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。
1. 构建开发镜像
1. 将开发环境构建为Docker镜像
.. code-block:: bash
git clone --recursive https://github.com/PaddlePaddle/Paddle
cd Paddle
docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
docker build -t paddle:dev .
请注意,默认情况下,:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做,需要设置一个参数
请注意,默认情况下,:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做,需要构建完开发镜像,然后执行
.. code-block:: bash
docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev
2. 运行开发环境
当我们编译好了 :code:`paddle:dev`, 我们可以在docker容器里做开发,源代码可以通过挂载本地文件来被载入Docker的开发环境里面:
.. code-block:: bash
docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev sshd
以上代码会启动一个带有PaddlePaddle开发环境的docker容器,源代码会被挂载到 :code:`/paddle` 。
请注意, :code:`paddle:dev` 的默认入口是 :code:`sshd` 。以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样,我们就能SSH进入我们的开发容器了:
以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样,我们就能SSH进入我们的开发容器了:
.. code-block:: bash
ssh root@localhost -p 2202
......@@ -124,13 +144,13 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod
3. 在Docker开发环境中编译与安装PaddlPaddle代码
当在容器里面的时候,可以用脚本 :code:`paddle/scripts/docker/build.sh` 来编译、安装与测试PaddlePaddle:
.. code-block:: bash
/paddle/paddle/scripts/docker/build.sh
以上指令会在 :code:`/paddle/build` 中编译PaddlePaddle。通过以下指令可以运行单元测试:
.. code-block:: bash
cd /paddle/build
......@@ -140,14 +160,14 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod
文档
----
Paddle的Docker镜像带有一个通过 `woboq code browser
Paddle的Docker开发镜像带有一个通过 `woboq code browser
<https://github.com/woboq/woboq_codebrowser>`_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。
只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码:
.. code-block:: bash
docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
docker run -d --name paddle-cpu-doc paddle:<version>-dev
docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
......@@ -8,173 +8,255 @@ Please be aware that you will need to change `Dockers settings
<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
of your hardware resource on Mac OS X and Windows.
Working With Docker
-------------------
Docker is simple as long as we understand a few basic concepts:
- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type
.. code-block:: bash
docker images
to list all images in the system. We can also run
.. code-block:: bash
docker pull paddlepaddle/paddle:0.10.0rc2
to download a Docker image, paddlepaddle/paddle in this example,
from Dockerhub.com.
- *container*: considering a Docker image a program, a container is a
"process" that runs the image. Indeed, a container is exactly an
operating system process, but with a virtualized filesystem, network
port space, and other virtualized environment. We can type
.. code-block:: bash
docker run paddlepaddle/paddle:0.10.0rc2
to start a container to run a Docker image, paddlepaddle/paddle in this example.
- By default docker container have an isolated file system namespace,
we can not see the files in the host file system. By using *volume*,
mounted files in host will be visible inside docker container.
Following command will mount current dirctory into /data inside
docker container, run docker container from debian image with
command :code:`ls /data`.
.. code-block:: bash
docker run --rm -v $(pwd):/data debian ls /data
Usage of CPU-only and GPU Images
----------------------------------
For each version of PaddlePaddle, we release 2 Docker images, a
CPU-only one and a CUDA GPU one. We do so by configuring
`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
automatically generate the latest docker images `paddledev/paddle:0.10.0rc1-cpu`
and `paddledev/paddle:0.10.0rc1-gpu`.
For each version of PaddlePaddle, we release two types of Docker images:
development image and production image. Production image includes
CPU-only version and a CUDA GPU version and their no-AVX versions. We
put the docker images on `dockerhub.com
<https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
latest versions under "tags" tab at dockerhub.com
To run the CPU-only image as an interactive container:
1. Production images, this image might have multiple variants:
.. code-block:: bash
- GPU/AVX::code:`paddlepaddle/paddle:<version>-gpu`
- GPU/no-AVX::code:`paddlepaddle/paddle:<version>-gpu-noavx`
- CPU/AVX::code:`paddlepaddle/paddle:<version>`
- CPU/no-AVX::code:`paddlepaddle/paddle:<version>-noavx`
docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
Please be aware that the CPU-only and the GPU images both use the
AVX instruction set, but old computers produced before 2008 do not
support AVX. The following command checks if your Linux computer
supports AVX:
or, we can run it as a daemon container
.. code-block:: bash
.. code-block:: bash
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
To run the CPU-only image as an interactive container:
and SSH to this container using password :code:`root`:
.. code-block:: bash
.. code-block:: bash
docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash
ssh -p 2202 root@localhost
Above method work with the GPU image too -- the recommended way is
using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
An advantage of using SSH is that we can connect to PaddlePaddle from
more than one terminals. For example, one terminal running vi and
another one running Python interpreter. Another advantage is that we
can run the PaddlePaddle container on a remote server and SSH to it
from a laptop.
Please install nvidia-docker first following this `tutorial
<https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
Above methods work with the GPU image too -- just please don't forget
to install CUDA driver and let Docker knows about it:
Now you can run a GPU image:
.. code-block:: bash
.. code-block:: bash
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
2. development image :code:`paddlepaddle/paddle:<version>-dev`
PaddlePaddle Book
------------------
This image has packed related develop tools and runtime
environment. Users and developers can use this image instead of
their own local computer to accomplish development, build,
releasing, document writing etc. While different version of paddle
may depends on different version of libraries and tools, if you
want to setup a local environment, you must pay attention to the
versions. The development image contains:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers use servers with GPUs, they can use ssh to login to
the server and run :code:`docker exec` to enter the docker
container and start their work. Also they can start a development
docker image with SSHD service, so they can login to the container
and start work.
The Jupyter Notebook is an open-source web application that allows
you to create and share documents that contain live code, equations,
visualizations and explanatory text in a single browser.
PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
We already exposed port 8888 for this book. If you want to
dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
Train Model Using Python API
----------------------------
Once you are inside the container, simply issue the command:
Our official docker image provides a runtime for PaddlePaddle
programs. The typical workflow will be as follows:
Create a directory as workspace:
.. code-block:: bash
jupyter notebook
Then, you would back and paste the address into the local browser:
.. code-block:: text
mkdir ~/workspace
http://localhost:8888/
Edit a PaddlePaddle python program using your favourite editor
That's all. Enjoy your journey!
.. code-block:: bash
emacs ~/workspace/example.py
Non-AVX Images
--------------
Run the program using docker:
Please be aware that the CPU-only and the GPU images both use the AVX
instruction set, but old computers produced before 2008 do not support
AVX. The following command checks if your Linux computer supports
AVX:
.. code-block:: bash
docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py
Or if you are using GPU for training:
.. code-block:: bash
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py
Above commands will start a docker container by running :code:`python
/workspace/example.py`. It will stop once :code:`python
/workspace/example.py` finishes.
If it doesn't, we will need to build non-AVX images manually from
source code:
Another way is to tell docker to start a :code:`/bin/bash` session and
run PaddlePaddle program interactively:
.. code-block:: bash
cd ~
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash
# now we are inside docker container
cd /workspace
python example.py
Running with GPU is identical:
Development Using Docker
------------------------
.. code-block:: bash
Developers can work on PaddlePaddle using Docker. This allows
developers to work on different platforms -- Linux, Mac OS X, and
Windows -- in a consistent way.
nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
# now we are inside docker container
cd /workspace
python example.py
1. Build the Development Environment as a Docker Image
.. code-block:: bash
Develop PaddlePaddle or Train Model Using C++ API
---------------------------------------------------
git clone --recursive https://github.com/PaddlePaddle/Paddle
cd Paddle
docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
We will be using PaddlePaddle development image since it contains all
compiling tools and dependencies.
Let's clone PaddlePaddle repo first:
Note that by default :code:`docker build` wouldn't import source
tree into the image and build it. If we want to do that, we need
to set a build arg:
.. code-block:: bash
.. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
Mount both workspace folder and paddle code folder into docker
container, so we can access them inside docker container. There are
two ways of using PaddlePaddle development docker image:
- run interactive bash directly
2. Run the Development Environment
.. code-block:: bash
Once we got the image :code:`paddle:dev`, we can use it to develop
Paddle by mounting the local source code tree into a container that
runs the image:
# use nvidia-docker instead of docker if you need to use GPU
docker run -it -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /bin/bash
# now we are inside docker container
.. code-block:: bash
- or, we can run it as a daemon container
docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev
.. code-block:: bash
This runs a container of the development environment Docker image
with the local source tree mounted to :code:`/paddle` of the
container.
# use nvidia-docker instead of docker if you need to use GPU
docker run -d -p 2202:22 -p 8888:8888 -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /usr/sbin/sshd -D
Note that the default entry-point of :code:`paddle:dev` is
:code:`sshd`, and above :code:`docker run` commands actually starts
an SSHD server listening on port 2202. This allows us to log into
this container with:
and SSH to this container using password :code:`root`:
.. code-block:: bash
.. code-block:: bash
ssh root@localhost -p 2202
ssh -p 2202 root@localhost
Usually, I run above commands on my Mac. I can also run them on a
GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it:
An advantage is that we can run the PaddlePaddle container on a
remote server and SSH to it from a laptop.
.. code-block:: bash
When developing PaddlePaddle, you can edit PaddlePaddle source code
from outside of docker container using your favoriate editor. To
compile PaddlePaddle, run inside container:
my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
.. code-block:: bash
3. Build and Install Using the Development Environment
WITH_GPU=OFF WITH_AVX=ON WITH_TEST=ON bash /paddle/paddle/scripts/docker/build.sh
Once I am in the container, I can use
:code:`paddle/scripts/docker/build.sh` to build, install, and test
Paddle:
This builds everything about Paddle in :code:`/paddle/build`. And we
can run unit tests there:
.. code-block:: bash
.. code-block:: bash
/paddle/paddle/scripts/docker/build.sh
cd /paddle/build
ctest
This builds everything about Paddle in :code:`/paddle/build`. And
we can run unit tests there:
When training model using C++ API, we can edit paddle program in
~/workspace outside of docker. And build from /workspace inside of
docker.
.. code-block:: bash
PaddlePaddle Book
------------------
The Jupyter Notebook is an open-source web application that allows
you to create and share documents that contain live code, equations,
visualizations and explanatory text in a single browser.
PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
We already exposed port 8888 for this book. If you want to
dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
We provide a packaged book image, simply issue the command:
.. code-block:: bash
cd /paddle/build
ctest
docker run -p 8888:8888 paddlepaddle/book
Then, you would back and paste the address into the local browser:
.. code-block:: text
http://localhost:8888/
That's all. Enjoy your journey!
Documentation
......@@ -191,7 +273,7 @@ container:
.. code-block:: bash
docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
docker run -d --name paddle-cpu-doc paddle:<version>
docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
......
......@@ -46,7 +46,6 @@ PaddlePaddle提供了ubuntu 14.04 deb安装包。
with_double: OFF
with_python: ON
with_rdma: OFF
with_metric_learning:
with_timer: OFF
with_predict_sdk:
......
......@@ -55,6 +55,7 @@ extensions = [
'sphinx.ext.napoleon',
'sphinx.ext.graphviz'
]
mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"
table_styling_embed_css = True
autodoc_member_order = 'bysource'
......
......@@ -6,9 +6,10 @@
## 介绍 ###
### 中文字典 ###
我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下: "《红楼梦》"将被分为 "《","红楼梦","》",和 "《红楼梦》"。字典采用UTF8编码,输出有2列:词本身和词频。字典共包含 3206325个词和3个特殊标记:
我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下: "《红楼梦》"将被分为 "《","红楼梦","》",和 "《红楼梦》"。字典采用UTF8编码,输出有2列:词本身和词频。字典共包含 3206326个词和4个特殊标记:
- `<s>`: 分词序列的开始
- `<e>`: 分词序列的结束
- `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: 占位符,没有实际意义
- `<unk>`: 未知词
### 中文词向量的预训练模型 ###
......
......@@ -6,9 +6,10 @@ We thank @lipeng for the pull request that defined the model schemas and pretrai
## Introduction ###
### Chinese Word Dictionary ###
Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using in-house word segmentor. For example, the participle of "《红楼梦》" is "《","红楼梦","》",and "《红楼梦》". Our dictionary (using UTF-8 format) has has two columns: word and its frequency. The total word count is 3206325, including 3 special token:
Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using in-house word segmentor. For example, the participle of "《红楼梦》" is "《","红楼梦","》",and "《红楼梦》". Our dictionary (using UTF-8 format) has has two columns: word and its frequency. The total word count is 3206326, including 4 special token:
- `<s>`: the start of a sequence
- `<e>`: the end of a sequence
- `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: a placeholder, just ignore it and its embedding
- `<unk>`: a word not included in dictionary
### Pretrained Chinese Word Embedding Model ###
......
......@@ -9,13 +9,8 @@ add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
if(WITH_PREDICT_SDK)
add_subdirectory(predict)
endif()
if(WITH_SWIG_PY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
add_subdirectory(api)
endif()
FUNCTION(generate_python_api target_name)
ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
${PROJ_ROOT}/paddle/Paddle_wrap.cxx
${PROJ_ROOT}/paddle/Paddle_wrap.h
COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig
&& mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
${PROJ_ROOT}/paddle/api/PaddleAPI.h
${external_project_dependencies}
WORKING_DIRECTORY ${PROJ_ROOT}/paddle
COMMENT "Generate Python API from swig")
ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS
${PROJ_ROOT}/paddle/Paddle_wrap.cxx
${PROJ_ROOT}/paddle/Paddle_wrap.h
${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
${external_project_dependencies})
ENDFUNCTION(generate_python_api)
set(API_SOURCES
Arguments.cpp
ConfigParser.cpp
......@@ -33,65 +15,84 @@ set(API_HEADER
PaddleAPI.h
Internal.h)
add_library(paddle_api STATIC
${API_SOURCES})
add_library(paddle_api STATIC ${API_SOURCES})
add_dependencies(paddle_api gen_proto_cpp)
list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
INCLUDE(${SWIG_USE_FILE})
INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
# Because gflags compiled by cmake, so it is imported by cmake target,
# not a real library path. Get the real library path here.
message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
else()
set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
endif()
FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall")
IF(WITH_COVERAGE)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
ENDIF(WITH_COVERAGE)
configure_file(
paddle_api_config.py.in
${PROJ_ROOT}/paddle/api/paddle_api_config.py
SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
paddle_parameter
paddle_function
paddle_math
paddle_utils
paddle_gserver
paddle_pserver
paddle_api
paddle_cuda
paddle_trainer_lib
paddle_network
paddle_proto
${external_project_dependencies}
)
generate_python_api(python_swig_sources)
IF(APPLE)
SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load")
ELSE(APPLE)
SET(START_GROUP "-Xlinker -start-group")
SET(END_GROUP "-Xlinker -end-group")
SET(ARCHIVE_START "-Wl,--whole-archive")
SET(ARCHIVE_END "-Wl,--no-whole-archive")
ENDIF(APPLE)
file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
SWIG_ADD_MODULE(swig_paddle python Paddle.i)
SWIG_LINK_LIBRARIES(swig_paddle
${MACOS_LD_FLAGS}
${START_GROUP}
${ARCHIVE_START}
paddle_gserver
paddle_function
${METRIC_LIBS}
${ARCHIVE_END}
paddle_pserver
paddle_trainer_lib
paddle_network
paddle_parameter
paddle_math
paddle_utils
paddle_proto
paddle_cuda
paddle_api
${CMAKE_DL_LIBS}
${EXTERNAL_LIBS}
${CMAKE_THREAD_LIBS_INIT}
${START_END}
)
# TODO(yuyang18) : make wheel name calculated by cmake
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
COMMAND rm -rf py_paddle.egg-info build
WORKING_DIRECTORY ${PROJ_ROOT}/paddle
DEPENDS python_swig_sources
paddle_parameter
paddle_function
paddle_math
paddle_utils
paddle_gserver
paddle_pserver
paddle_trainer
paddle_api
paddle_cuda
${PY_PADDLE_PYTHON_FILES}
DEPENDS _swig_paddle
)
install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
DESTINATION opt/paddle/share/wheels
)
# TODO(yuyang18) : make wheel name calculated by cmake
add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so)
add_custom_target(python_api_wheel ALL DEPENDS
${PROJ_ROOT}/paddle/dist/.timestamp)
add_dependencies(python_api_wheel python_swig_sources
paddle_parameter
paddle_math
paddle_utils
paddle_gserver
paddle_pserver
paddle_trainer
paddle_api
paddle_cuda)
install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ DESTINATION opt/paddle/share/wheels)
if(WITH_TESTING)
IF(NOT PY_PIP_FOUND)
......
PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
WITH_GPU="@WITH_GPU@"
PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@"
ZLIB_LIBRARIES="@ZLIB_LIBRARIES@"
CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
WITH_PYTHON="@WITH_PYTHON@"
PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
GLOG_LIBRARIES="@GLOG_LIBRARIES@"
GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
GFLAGS_LOCATION="@GFLAGS_LOCATION@"
CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
WITH_COVERALLS="@ON_COVERALLS@"
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
try:
from paddle_api_config import *
import os.path
import platform
system = platform.system().lower()
is_osx = (system == 'darwin')
is_win = (system == 'windows')
is_lin = (system == 'linux')
if is_lin:
whole_start = "-Wl,--whole-archive"
whole_end = "-Wl,--no-whole-archive"
elif is_osx:
whole_start = ""
whole_end = ""
LIB_DIRS = [
"math", 'function', 'utils', 'parameter', "gserver", "api", "cuda",
"pserver", "trainer"
]
PARENT_LIB_DIRS = ['proto']
class PaddleLDFlag(object):
def __init__(self):
self.paddle_build_dir = PADDLE_BUILD_DIR
self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
self.protolib = PROTOBUF_LIBRARY
self.zlib = ZLIB_LIBRARIES
self.thread = CMAKE_THREAD_LIB
self.dl_libs = CMAKE_DL_LIBS
self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
self.python_libs = PYTHON_LIBRARIES
self.glog_libs = GLOG_LIBRARIES
self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
self.gflags_libs = GFLAGS_LIBRARIES
self.gflags_location = GFLAGS_LOCATION
self.cblas_libs = CBLAS_LIBRARIES
self.curt = CUDA_LIBRARIES
def ldflag_str(self):
return " ".join(
[self.libs_dir_str(), self.parent_dir_str(), self.libs_str()])
def libs_dir_str(self):
libdirs = LIB_DIRS
return " ".join(
map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x),
libdirs))
def parent_dir_str(self):
libdirs = PARENT_LIB_DIRS
return " ".join(
map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
libdirs))
def libs_str(self):
libs = [
whole_start,
"-lpaddle_gserver",
"-lpaddle_function",
whole_end,
"-lpaddle_pserver",
"-lpaddle_trainer_lib",
"-lpaddle_network",
'-lpaddle_parameter',
"-lpaddle_math",
'-lpaddle_utils',
"-lpaddle_proto",
"-lpaddle_cuda",
"-lpaddle_api",
self.normalize_flag(self.protolib),
self.normalize_flag(self.glog_libs),
self.normalize_flag(self.gflags_libs),
self.normalize_flag(self.zlib),
self.normalize_flag(self.thread),
self.normalize_flag(self.dl_libs),
self.normalize_flag(self.cblas_libs),
]
if self.with_python:
libs.append(self.normalize_flag(self.python_libs))
if self.with_gpu:
libs.append(self.normalize_flag(self.curt))
if self.with_coverage:
libs.append("-fprofile-arcs")
return " ".join(filter(lambda l: len(l) != 0, libs))
def normalize_flag(self, cmake_flag):
"""
CMake flag string to ld flag
:type cmake_flag: str
"""
if ";" in cmake_flag:
return " ".join(map(self.normalize_flag, cmake_flag.split(";")))
if cmake_flag.startswith("/"): # is a path
return cmake_flag
elif cmake_flag.startswith("-l"): # normal link command
return cmake_flag
elif cmake_flag in [
"gflags-shared", "gflags-static", "gflags_nothreads-shared",
"gflags_nothreads-static"
]: # special for gflags
assert PaddleLDFlag.cmake_bool(self.gflags_location)
return self.gflags_location
elif len(cmake_flag) != 0:
return "".join(["-l", cmake_flag])
else:
return ""
@staticmethod
def cmake_bool(cmake_str):
"""
CMake bool string to bool
:param cmake_str: cmake boolean string
:type cmake_str: str
:rtype: bool
"""
if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith(
"-NOTFOUND"):
return False
else:
return True
def c_flag(self):
if self.with_coverage:
return [
"-fprofile-arcs", "-ftest-coverage", "-O0", "-g",
"-std=c++11"
]
else:
return ["-std=c++11"]
except ImportError:
class PaddleLDFlag(object):
def ldflag_str(self):
pass
def c_flag(self):
pass
......@@ -17,7 +17,11 @@ limitations under the License. */
#include <stdio.h>
#include "hl_base.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_neon_matrix_kernel.cuh"
#else
#include "hl_sse_matrix_kernel.cuh"
#endif
/**
* @brief cpu element wise unary operator.
......
......@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
typedef BaseOp SSEFirst;
typedef BaseOp SSESecond;
typedef BaseOp SSEClassificationError;
#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
#include "hl_matrix_base_neon.cuh"
#else
#include "hl_matrix_base_sse.cuh"
#endif
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_BASE_NEON_CUH_
#define HL_MATRIX_BASE_NEON_CUH_
namespace aggregate {
class SSESum {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vaddq_f32(a, b);
}
};
class SSEMax {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vmaxq_f32(a, b);
}
};
class SSEMin {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vminq_f32(a, b);
}
};
} // namespace aggregate
namespace base {
namespace unary {
class SSEIdentity {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a) const {
return a;
}
};
} // namespace unary
namespace binary {
class SSEAdd {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vaddq_f32(a, b);
}
};
class SSEAdd2 {
public:
static const bool sse = true;
const real p1;
const real p2;
float32x4_t mp1;
float32x4_t mp2;
public:
SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
mp1 = vdupq_n_f32(p1);
mp2 = vdupq_n_f32(p2);
}
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp1, tmp2;
tmp1 = vmulq_f32(mp1, a);
tmp2 = vmulq_f32(mp2, b);
return vaddq_f32(tmp1, tmp2);
}
};
class SSESub {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vsubq_f32(a, b);
}
};
class SSEMul {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vmulq_f32(a, b);
}
};
class SSEDiv {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp;
tmp = vrecpeq_f32(b);
return vmulq_f32(a, tmp);
}
};
class SSESquaredDiff {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp;
tmp = vsubq_f32(a, b);
return vmulq_f32(tmp, tmp);
}
};
class SSEFirst {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return a;
}
};
class SSESecond {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return b;
}
};
class SSEClassificationError {
public:
static const bool sse = true;
const real p;
float32x4_t mp;
uint32x4_t result;
public:
explicit SSEClassificationError(const real s) : p(s) {
mp = vdupq_n_f32(p);
result = vdupq_n_u32(1);
}
// TODO: to be check
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
uint32x4_t tmp1 = vcgtq_f32(a, mp);
uint32x4_t tmp2 = vcgtq_f32(b, mp);
uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
return vcvtq_f32_u32(vandq_u32(tmp3, result));
}
};
} // namespace binary
} // namespace base
#endif /* HL_MATRIX_BASE_NEON_CUH_ */
......@@ -17,13 +17,20 @@ limitations under the License. */
#include "hl_base.h"
#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__)
#include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType;
#else
typedef double2 vecType;
#endif
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#include <arm_neon.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef float32x4_t vecType;
#else
#error NEON instructions does not support double precision
#endif
#else
#include <mmintrin.h>
#include <xmmintrin.h>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_NEON_MATRIX_KERNEL_CUH_
#define HL_NEON_MATRIX_KERNEL_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET vdupq_n_f32
inline bool hl_check_align(size_t size) {
return !(size & (VECTOR_SIZE - 1));
}
inline bool hl_check_align(void *ptr) {
return hl_check_align(reinterpret_cast<size_t>(ptr));
}
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
float32x4_t rev = vrev64q_f32(mm);
float32x4_t tmp1 = agg.vecOp(rev, rev);
float32x2_t lo = vget_high_f32(rev);
float32x2_t hi = vget_low_f32(rev);
float32x4_t tmp2 = vcombine_f32(hi, lo);
float32x4_t ret = agg.vecOp(tmp1, tmp2);
return vgetq_lane_f32(ret, 0);
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst, int ld,
real *A, int lda) {
for (int i = 0; i < dimM; i++, A += lda) {
vecType mm = VECTOR_SET(agg.init());
vecType *a = (vecType*)(A);
for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
mm = agg.vecOp(mm, op.vecOp(*a));
}
int rem = dimN % VECTOR_LEN;
if (rem) {
real tmp = hl_agg_op(agg, mm);
real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
for (int j = 0; j < rem; j++) {
tmp = agg(tmp, op(a[j]));
}
dst[i*ld] = sv(dst[i*ld], tmp);
} else {
dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
}
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst, int ld,
real *A, int lda,
real *B, int ldb) {
for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
vecType mm = VECTOR_SET(agg.init());
vecType *a = (vecType*)(A);
vecType *b = (vecType*)(B);
for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
mm = agg.vecOp(mm, op.vecOp(*a, *b));
}
int rem = dimN % VECTOR_LEN;
if (rem) {
real tmp = hl_agg_op(agg, mm);
real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
for (int j = 0; j < rem; j++) {
tmp = agg(tmp, op(a[j], b[j]));
}
dst[i*ld] = sv(dst[i*ld], tmp);
} else {
dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
}
}
}
template <class Agg, class Op, class Saver>
void hl_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
for (int j = 0; j < dimN; j++) {
real tmp = agg.init();
for (int i = 0; i < dimM; i++) {
tmp = agg(tmp, op(A[i * lda + j]));
}
dst[j] = sv(dst[j], tmp);
}
}
template <class Agg, class Op, class Saver>
void hl_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
for (int j = 0; j < dimN; j++) {
real tmp = agg.init();
for (int i = 0; i < dimM; i++) {
tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
}
dst[j] = sv(dst[j], tmp);
}
}
/*
* MaxRow greater than or equal dimN
* dimN is multiples of VECTOR_LEN
* so rem <= MaxRow / VECTOR_LEN
*/
template <int MaxRow, class Agg, class Op, class Saver>
void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
vecType mm[MaxRow / VECTOR_LEN];
for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
int rem = dimN % VECTOR_LEN;
if (rem) {
A += (dimN / VECTOR_LEN) * VECTOR_LEN;
dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
}
}
/*
* dimN is multiples of VECTOR_LEN
* dimN greater than Step
*/
template <int Step, class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
vecType mm[Step / VECTOR_LEN];
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
}
int remRow = dimN % Step;
if (remRow) {
hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
if (dimN <= 16) {
hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
} else if (dimN <= 32) {
hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
} else if (dimN <= 1024 || dimM <= 512) {
hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
} else {
hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
}
}
template <int MaxRow, class Agg, class Op, class Saver>
void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
vecType mm[MaxRow / VECTOR_LEN];
for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
vecType *b = (vecType*)(B + i * ldb);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
int rem = dimN % VECTOR_LEN;
if (rem) {
A += (dimN / VECTOR_LEN) * VECTOR_LEN;
B += (dimN / VECTOR_LEN) * VECTOR_LEN;
dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
}
}
template <int Step, class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
vecType mm[Step / VECTOR_LEN];
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
vecType *b = (vecType*)(B + i * ldb);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
}
int remRow = dimN % Step;
if (remRow) {
hl_sse_column_op_with_rem<Step>(
agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
if (dimN <= 16) {
hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else if (dimN <= 32) {
hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else if (dimN <= 1024 || dimM <= 512) {
hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else {
hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
}
}
#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */
......@@ -159,4 +159,10 @@ extern void hl_sequence_avg_forward(real* dst,
int width,
const int mode);
extern void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode);
#endif /* HL_SEQUENCE_H_ */
......@@ -57,4 +57,10 @@ inline void hl_sequence_avg_forward(real* dst,
int width,
const int mode) {}
inline void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {}
#endif // HL_SEQUENCE_STUB_H_
......@@ -325,12 +325,12 @@ __global__ void KeSequenceAvgForward(real* dst,
int seqLength = end - start;
if (seqLength == 0) return;
real sum = 0.0;
for (int i = 0; i < seqLength; i++) {
sum += src[(start + i) * width + col];
for (int i = start; i < end; i++) {
sum += src[i * width + col];
}
sum = mode == 1 ? sum :
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
dst[row * width + col] = sum;
dst[gid] = sum;
}
}
......@@ -354,3 +354,48 @@ void hl_sequence_avg_forward(real* dst,
(dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed");
}
__global__ void KeSequenceAvgBackward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int row = gid / width;
int col = gid % width;
if (gid < height * width) {
int start = starts[row];
int end = starts[row + 1];
int seqLength = end - start;
if (seqLength == 0) return;
real grad = src[gid];
grad = mode == 1 ? grad :
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) {
dst[i * width + col] += grad;
}
}
}
void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {
CHECK_NOTNULL(dst);
CHECK_NOTNULL(src);
CHECK_NOTNULL(starts);
int block = 512;
int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed");
}
......@@ -16,66 +16,6 @@ limitations under the License. */
namespace paddle {
template <>
size_t FuncConfig::get<size_t>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.s;
}
template <>
real FuncConfig::get<real>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.r;
}
template <>
int FuncConfig::get<int>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.i;
}
template <>
bool FuncConfig::get<bool>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.b;
}
template <>
FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].s = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].r = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].i = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].b = v;
return *this;
}
void BufferArgs::addArg(const Matrix& arg,
const TensorShape& shape,
ArgType argType) {
......
......@@ -18,32 +18,49 @@ limitations under the License. */
#include <vector>
#include "BufferArg.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Any.h"
#include "paddle/utils/ClassRegistrar.h"
#include "paddle/utils/Error.h"
namespace paddle {
/**
* Function Configuration.
* The argument type of Function::init.
* Follow-up will consider moving this data structure to Proto inside.
*/
class FuncConfig {
public:
union value {
size_t s;
real r;
int i;
bool b;
};
template <typename T>
T get(const std::string& key) const;
T get(const std::string& key, Error* err = nullptr) const {
try {
return any_cast<T>(valueMap_.at(key));
} catch (std::exception& e) { // could be cast or out of range exception.
if (err) {
*err = Error(e.what());
} else {
LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
}
return T();
}
}
template <typename T>
FuncConfig& set(const std::string& key, T v);
FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
auto it = valueMap_.find(key);
if (it != valueMap_.end()) { // already contains key.
if (err) {
*err = Error("Key %s is already set in FuncConfig", key.c_str());
} else {
LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
}
return *this;
}
valueMap_[key] = any(v);
return *this;
}
protected:
std::map<std::string, value> valueMap_;
mutable std::unordered_map<std::string, any> valueMap_;
};
/**
......
......@@ -25,9 +25,9 @@ void Pad<DEVICE_TYPE_CPU>(real* outputs,
const int inH,
const int inW,
const PadConf& pad) {
int cstart = pad.channelStart, cend = pad.channelEnd;
int hstart = pad.heightStart, hend = pad.heightEnd;
int wstart = pad.widthStart, wend = pad.widthEnd;
int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend;
int outH = inH + hstart + hend;
int outW = inW + wstart + wend;
......@@ -51,9 +51,9 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
const int inH,
const int inW,
const PadConf& pad) {
int cstart = pad.channelStart, cend = pad.channelEnd;
int hstart = pad.heightStart, hend = pad.heightEnd;
int wstart = pad.widthStart, wend = pad.widthEnd;
int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend;
int outH = inH + hstart + hend;
int outW = inW + wstart + wend;
......@@ -71,6 +71,12 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
}
}
static inline PadConf castToPadConf(const FuncConfig& conf) {
return {conf.get<std::vector<uint32_t>>("channel"),
conf.get<std::vector<uint32_t>>("height"),
conf.get<std::vector<uint32_t>>("width")};
}
/**
* \brief Padding zeros to input according to the specify dimension.
* The struct pad_ contains the padding size in each dimension.
......@@ -127,14 +133,7 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
template <DeviceType Device>
class PadFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
pad_.channelStart = config.get<int>("cstart");
pad_.channelEnd = config.get<int>("cend");
pad_.heightStart = config.get<int>("hstart");
pad_.heightEnd = config.get<int>("hend");
pad_.widthStart = config.get<int>("wstart");
pad_.widthEnd = config.get<int>("wend");
}
void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size());
......@@ -175,14 +174,7 @@ private:
template <DeviceType Device>
class PadGradFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
pad_.channelStart = config.get<int>("cstart");
pad_.channelEnd = config.get<int>("cend");
pad_.heightStart = config.get<int>("hstart");
pad_.heightEnd = config.get<int>("hend");
pad_.widthStart = config.get<int>("wstart");
pad_.widthEnd = config.get<int>("wend");
}
void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size());
......
......@@ -19,18 +19,12 @@ limitations under the License. */
namespace paddle {
struct PadConf {
/// how many values to add before the data along channel dimension.
int channelStart;
/// how many values to add after the data along channel dimension.
int channelEnd;
/// how many values to add before the data along height dimension.
int heightStart;
/// how many values to add after the data along height dimension.
int heightEnd;
/// how many values to add before the data along width dimension.
int widthStart;
/// how many values to add after the data along width dimension.
int widthEnd;
/// how many values to add before/after the data along channel dimension.
std::vector<uint32_t> channel;
/// how many values to add before/after the data along height dimension.
std::vector<uint32_t> height;
/// how many values to add before/after the data along width dimension.
std::vector<uint32_t> width;
};
/**
......
......@@ -44,9 +44,9 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
size_t nth = num * inC * inH * inW;
int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024;
int cstart = pad.channelStart, cend = pad.channelEnd;
int hstart = pad.heightStart, hend = pad.heightEnd;
int wstart = pad.widthStart, wend = pad.widthEnd;
int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend;
int outH = inH + hstart + hend;
int outW = inW + wstart + wend;
......@@ -83,9 +83,9 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
int nth = num * inC * inH * inW;
int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024;
int cstart = pad.channelStart, cend = pad.channelEnd;
int hstart = pad.heightStart, hend = pad.heightEnd;
int wstart = pad.widthStart, wend = pad.widthEnd;
int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend;
int outH = inH + hstart + hend;
int outW = inW + wstart + wend;
......
......@@ -24,48 +24,22 @@ TEST(Pad, real) {
for (size_t imgSizeW : {5, 32, 96}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
FunctionCompare compare("Pad",
FuncConfig()
.set("cstart", 2)
.set("cend", 3)
.set("hstart", 1)
.set("hend", 2)
.set("wstart", 3)
.set("wend", 2));
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{
numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims));
compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO));
compare.run();
}
}
}
}
}
TEST(PadGrad, real) {
for (size_t numSamples : {5, 32}) {
for (size_t channels : {1, 5, 32}) {
for (size_t imgSizeH : {5, 33, 100}) {
for (size_t imgSizeW : {5, 32, 96}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
FunctionCompare compare("PadGrad",
FuncConfig()
.set("cstart", 2)
.set("cend", 3)
.set("hstart", 1)
.set("hend", 2)
.set("wstart", 3)
.set("wend", 2));
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{
numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims));
compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO));
compare.run();
for (bool test_grad : {false, true}) {
FunctionCompare compare(
test_grad ? "PadGrad" : "Pad",
FuncConfig()
.set<std::vector<uint32_t>>("channel", {2, 3})
.set<std::vector<uint32_t>>("height", {1, 2})
.set<std::vector<uint32_t>>("width", {3, 2}));
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{
numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
compare.addInputs(
BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
compare.addOutputs(BufferArg(
VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
compare.run();
}
}
}
}
......
......@@ -134,9 +134,7 @@ public:
backward(callback);
}
virtual Argument getLayerOutput(const std::string& layerName) {
return *((Argument*)nullptr);
}
virtual Argument getLayerOutput(const std::string& layerName) = 0;
// see comment in Layer.h for the function with the same name
virtual void resetState() {}
......
......@@ -42,7 +42,8 @@ void AgentLayer::forward(PassType passType) {
// get Arguments from real layers
if (numSamples_ > 0 && numSamples_ < realHeight) {
if (realOutput.ids) {
output_.ids->subVecFrom(*realOutput.ids, 0, numSamples_);
output_.ids =
IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
} else {
output_.subArgFrom(
realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
......
......@@ -26,8 +26,6 @@ bool AverageLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
SequencePoolLayer::init(layerMap, parameterMap);
dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
// average strategy
if (config_.average_strategy() == "average") {
mode_ = kAverage;
......@@ -60,43 +58,9 @@ void AverageLayer::forward(PassType passType) {
void AverageLayer::backward(const UpdateCallback& callback) {
SequencePoolLayer::backward(callback);
const int* starts = startPositions_->getData(false);
MatrixPtr grad = getInputGrad(0);
if (grad) {
size_t dim = getSize();
real* gradientData = getInputGrad(0)->getData();
real* gradient = getOutputGrad()->getData();
size_t numSequences = startPositions_->getSize() - 1;
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
// TODO(Dangqingqing) optimization for GPU
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
if (0 == sequenceLength) {
// empty sequence
continue;
}
dataMtx_->setData(
gradientData + starts[sequenceId] * dim, sequenceLength, dim);
outMtx_->setData(gradient + sequenceId * dim);
switch (mode_) {
case kAverage: {
// plain average
dataMtx_->addBias(*outMtx_, 1.0f / sequenceLength);
break;
}
case kSum: {
// sum instead of average
dataMtx_->addBias(*outMtx_, 1.0f);
break;
}
case kAverageSquareRootN: {
// divide by square root of sequenceLength
dataMtx_->addBias(*outMtx_, 1.0f / sqrt(sequenceLength));
break;
}
default: { LOG(FATAL) << "should not reach here"; }
}
}
if (getInputGrad(0)) {
getInputGrad(0)->sequenceAvgBackward(
*getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
}
}
......
......@@ -45,8 +45,6 @@ public:
void backward(const UpdateCallback& callback = nullptr) override;
protected:
MatrixPtr outMtx_;
MatrixPtr dataMtx_;
int mode_;
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
#include "NormLayer.h"
#include "paddle/math/BaseMatrix.h"
#include "paddle/math/Matrix.h"
namespace paddle {
MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
size_t iter,
size_t spatialDim) {
return Matrix::create(data->getData() + iter * channels_ * spatialDim,
channels_,
spatialDim,
false,
useGpu_);
}
MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
size_t iter,
size_t spatialDim) {
return Matrix::create(
data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
}
void CrossChannelNormLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV = getInputValue(0);
size_t batchSize = inV->getHeight();
size_t dataDim = inV->getWidth();
CHECK_EQ(getSize(), dataDim);
reserveOutput(batchSize, dataDim);
MatrixPtr outV = getOutputValue();
size_t spatialDim = dataDim / channels_;
Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
normBuffer_->zeroMem();
// add eps to avoid overflow
normBuffer_->addScalar(*normBuffer_, 1e-6);
inV->square2(*dataBuffer_);
for (size_t i = 0; i < batchSize; i++) {
const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
// compute norm.
spatialBuffer_->sumCols(*dataTmp, 1, 0);
spatialBuffer_->sqrt2(*spatialBuffer_);
normTmp->copyFrom(*spatialBuffer_);
outVTmp->copyFrom(*inVTmp);
outVTmp->divRowVector(*spatialBuffer_);
// scale the layer.
outVTmp->mulColVector(*scale_->getW());
}
}
void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
MatrixPtr inG = getInputGrad(0);
MatrixPtr inV = getInputValue(0);
MatrixPtr outG = getOutputGrad();
MatrixPtr outV = getOutputValue();
size_t batchSize = inG->getHeight();
size_t dataDim = inG->getWidth();
size_t spatialDim = dataDim / channels_;
dataBuffer_->dotMul(*outG, *outV);
Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
scaleDiff_->zeroMem();
for (size_t i = 0; i < batchSize; i++) {
MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
channelBuffer_->sumRows(*dataTmp, 1, 0);
channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
// store a / scale[i] in scaleDiff_ temporary
scaleDiff_->add(*channelBuffer_, 1.);
sampleBuffer_->dotMul(*inVTmp, *outGTmp);
spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
// scale the grad
inGTmp->copyFrom(*inVTmp);
inGTmp->mulRowVector(*spatialBuffer_);
// divide by square of norm
spatialBuffer_->dotMul(*normTmp, *normTmp);
inGTmp->divRowVector(*spatialBuffer_);
// subtract
inGTmp->add(*outGTmp, -1, 1);
// divide by norm
inGTmp->divRowVector(*normTmp);
// scale the diff
inGTmp->mulColVector(*scale_->getW());
}
// updata scale
if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
scale_->getParameterPtr()->incUpdate(callback);
}
} // namespace paddle
......@@ -107,6 +107,10 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
int channel = isDeconv_ ? numFilters_ : channels_[inIdx];
resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
CHECK_EQ(image->getWidth(),
static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
real *imgData = image->getData() + startIdx * image->getWidth();
MatrixPtr imageTmp =
Matrix::create(imgData,
......
......@@ -36,7 +36,7 @@ namespace paddle {
* | |- 5
* |
* |-*- 0
* |- 1
* |- 1
* @endcode
*
* where * indicates an internal node, and each leaf node represents a class.
......
......@@ -26,6 +26,8 @@ Layer* NormLayer::create(const LayerConfig& config) {
return new ResponseNormLayer(config);
} else if (norm == "cmrnorm-projection") {
return new CMRProjectionNormLayer(config);
} else if (norm == "cross-channel-norm") {
return new CrossChannelNormLayer(config);
} else {
LOG(FATAL) << "Unknown norm type: " << norm;
return nullptr;
......@@ -54,4 +56,14 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
return true;
}
bool CrossChannelNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK(parameters_[0]);
const NormConfig& conf = config_.inputs(0).norm_conf();
channels_ = conf.channels();
scale_.reset(new Weight(channels_, 1, parameters_[0]));
return true;
}
} // namespace paddle
......@@ -65,4 +65,35 @@ public:
}
};
/**
* This layer applys normalization across the channels of each sample to a
* conv layer's output, and scales the output by a group of trainable factors
* whose dimensions equal to the number of channels.
* - Input: One and only one input layer are accepted.
* - Output: The normalized data of the input data.
* Reference:
* Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
* Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
*/
class CrossChannelNormLayer : public NormLayer {
public:
explicit CrossChannelNormLayer(const LayerConfig& config)
: NormLayer(config) {}
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
void forward(PassType passType);
void backward(const UpdateCallback& callback);
MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
protected:
size_t channels_;
std::unique_ptr<Weight> scale_;
MatrixPtr scaleDiff_;
MatrixPtr normBuffer_;
MatrixPtr dataBuffer_;
MatrixPtr channelBuffer_;
MatrixPtr spatialBuffer_;
MatrixPtr sampleBuffer_;
};
} // namespace paddle
......@@ -36,12 +36,9 @@ bool PadLayer::init(const LayerMap& layerMap,
CHECK_EQ(2, pad_conf.pad_c_size());
CHECK_EQ(2, pad_conf.pad_h_size());
CHECK_EQ(2, pad_conf.pad_w_size());
padc_.push_back(pad_conf.pad_c(0));
padc_.push_back(pad_conf.pad_c(1));
padh_.push_back(pad_conf.pad_h(0));
padh_.push_back(pad_conf.pad_h(1));
padw_.push_back(pad_conf.pad_w(0));
padw_.push_back(pad_conf.pad_w(1));
padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
outDims_ = TensorShape(4);
setOutDims(0);
......@@ -49,21 +46,15 @@ bool PadLayer::init(const LayerMap& layerMap,
createFunction(forward_,
"Pad",
FuncConfig()
.set("cstart", padc_[0])
.set("cend", padc_[1])
.set("hstart", padh_[0])
.set("hend", padh_[1])
.set("wstart", padw_[0])
.set("wend", padw_[1]));
.set("channel", padc_)
.set("height", padh_)
.set("width", padw_));
createFunction(backward_,
"PadGrad",
FuncConfig()
.set("cstart", padc_[0])
.set("cend", padc_[1])
.set("hstart", padh_[0])
.set("hend", padh_[1])
.set("wstart", padw_[0])
.set("wend", padw_[1]));
.set("channel", padc_)
.set("height", padh_)
.set("width", padw_));
return true;
}
......
......@@ -38,9 +38,9 @@ protected:
void setOutDims(const size_t batchSize);
void setTensorDim(const size_t batchSize);
std::vector<int> padc_;
std::vector<int> padh_;
std::vector<int> padw_;
std::vector<uint32_t> padc_;
std::vector<uint32_t> padh_;
std::vector<uint32_t> padw_;
TensorShape inDims_;
TensorShape outDims_;
};
......
......@@ -20,7 +20,7 @@ namespace paddle {
/**
* @brief A layer for generating priorbox locations and variances.
* - Input: Two and only two input layer are accepted. The input layer must be
* be a data output layer and a convolution output layer.
* be a data output layer and a convolution output layer.
* - Output: The priorbox locations and variances of the input data.
* Reference:
* Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
......@@ -45,27 +45,32 @@ protected:
MatrixPtr buffer_;
};
REGISTER_LAYER(priorbox, PriorBoxLayer);
bool PriorBoxLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
auto pbConf = config_.inputs(0).priorbox_conf();
std::vector<real> tmp;
aspectRatio_.push_back(1.);
std::copy(pbConf.min_size().begin(),
pbConf.min_size().end(),
std::back_inserter(minSize_));
std::copy(pbConf.max_size().begin(),
pbConf.max_size().end(),
std::back_inserter(maxSize_));
std::copy(pbConf.aspect_ratio().begin(),
pbConf.aspect_ratio().end(),
std::back_inserter(aspectRatio_));
std::copy(pbConf.variance().begin(),
pbConf.variance().end(),
std::back_inserter(variance_));
std::copy(pbConf.aspect_ratio().begin(),
pbConf.aspect_ratio().end(),
std::back_inserter(tmp));
// flip
int inputRatioLength = aspectRatio_.size();
for (int index = 0; index < inputRatioLength; index++)
aspectRatio_.push_back(1 / aspectRatio_[index]);
aspectRatio_.push_back(1.);
int inputRatioLength = tmp.size();
for (int index = 0; index < inputRatioLength; index++) {
aspectRatio_.push_back(tmp[index]);
aspectRatio_.push_back(1 / tmp[index]);
}
numPriors_ = aspectRatio_.size();
if (maxSize_.size() > 0) numPriors_++;
return true;
......@@ -94,12 +99,12 @@ void PriorBoxLayer::forward(PassType passType) {
for (int w = 0; w < layerWidth; ++w) {
real centerX = (w + 0.5) * stepW;
real centerY = (h + 0.5) * stepH;
int minSize = 0;
real minSize = 0;
for (size_t s = 0; s < minSize_.size(); s++) {
// first prior.
minSize = minSize_[s];
int boxWidth = minSize;
int boxHeight = minSize;
real boxWidth = minSize;
real boxHeight = minSize;
// xmin, ymin, xmax, ymax.
tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
......@@ -112,7 +117,7 @@ void PriorBoxLayer::forward(PassType passType) {
CHECK_EQ(minSize_.size(), maxSize_.size());
// second prior.
for (size_t s = 0; s < maxSize_.size(); s++) {
int maxSize = maxSize_[s];
real maxSize = maxSize_[s];
boxWidth = boxHeight = sqrt(minSize * maxSize);
tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
......@@ -145,6 +150,5 @@ void PriorBoxLayer::forward(PassType passType) {
MatrixPtr outV = getOutputValue();
outV->copyFrom(buffer_->data_, dim * 2);
}
REGISTER_LAYER(priorbox, PriorBoxLayer);
} // namespace paddle
......@@ -25,6 +25,11 @@ namespace paddle {
* Input: a sequence
* If SequenceLevel = kNonseq:
* Output: a sequence containing only the last instance of the input sequence
* If stride_ > 0:
* Output: a shorten sequence. The operation of getting last instance of a
* sequence is independently performed on every slice of the input
* sequence, which is obtained by sliding a window with the window
* size set to stride_.
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: a sequence containing only the last instance of each sub-sequence
......@@ -37,6 +42,7 @@ class SequenceLastInstanceLayer : public SequencePoolLayer {
protected:
MatrixPtr tmpSrc_;
MatrixPtr tmpDest_;
std::vector<int> instanceIds_;
public:
explicit SequenceLastInstanceLayer(const LayerConfig& config)
......@@ -54,6 +60,7 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
SequencePoolLayer::init(layerMap, parameterMap);
reversed_ = config_.select_first();
tmpSrc_ =
Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
......@@ -66,7 +73,8 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
void SequenceLastInstanceLayer::forward(PassType passType) {
SequencePoolLayer::forward(passType);
const int* starts = startPositions_->getData(false);
auto starts = (stride_ > 0) ? stridePositions_->getData()
: startPositions_->getData(false);
MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue();
......@@ -74,9 +82,10 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
AsyncGpuBlock asyncGpuBlock;
REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
instanceIds_.clear();
for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
int insId =
config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
instanceIds_.push_back(insId);
outputValue->subMatrix(seqId, 1, tmpDest_)
->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
......@@ -96,18 +105,13 @@ void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad();
const int* starts = startPositions_->getData(false);
size_t numSequences = startPositions_->getSize() - 1;
if (inputGrad) {
AsyncGpuBlock asyncGpuBlock;
REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());
for (size_t seqId = 0; seqId < numSequences; ++seqId) {
int insId =
config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
inputGrad->subMatrix(insId, 1, tmpDest_)
for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
}
}
......
......@@ -37,6 +37,7 @@ bool SequencePoolLayer::init(const LayerMap& layerMap,
} else {
LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
}
stride_ = config_.seq_pool_stride();
setNeedSequenceInfo(false);
return true;
}
......@@ -55,8 +56,6 @@ void SequencePoolLayer::forward(PassType passType) {
CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
CHECK_EQ(newBatchSize_, starts->getSize() - 1);
resetOutput(newBatchSize_, dim);
/* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
* thus, in this case, output_ has no sequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
......@@ -67,6 +66,15 @@ void SequencePoolLayer::forward(PassType passType) {
<< "when trans_type = seq, input must hasSubseq";
output_.degradeSequence(input);
}
if (stride_ > 0) {
CHECK_EQ(input.hasSubseq(), 0UL)
<< "sequence stride pooling is invalid for hasSubseq now";
output_.poolSequenceWithStride(
input, stride_, &stridePositions_, reversed_);
newBatchSize_ = stridePositions_->getSize() - 1;
}
resetOutput(newBatchSize_, dim);
}
void SequencePoolLayer::backward(const UpdateCallback& callback) {
......
......@@ -26,6 +26,10 @@ namespace paddle {
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = seqlastin/average/max_{for each instance in this
* sequence}{input[i]}
* If stride_ > 0:
* Check input sequence must not have sub-sequence
* Output: a shorten sequence, pooling is performed upon a small local
* area
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences
......@@ -42,6 +46,11 @@ protected:
enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
size_t newBatchSize_;
ICpuGpuVectorPtr startPositions_;
int stride_;
// Store the start position of each window.
IVectorPtr stridePositions_;
// Whether the input sequence is reversed or not.
bool reversed_ = false;
public:
explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
......
......@@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf,
config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
config.layerConfig.set_bias_size(config.biasSize);
config.layerConfig.set_shared_biases(sharedBias);
config.inputDefs.push_back(
{inputType, "layer_0", conf.input_size(), parameterSize});
config.inputDefs.push_back({inputType,
"layer_0",
static_cast<size_t>(conf.input_size()),
parameterSize});
*config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
config.testState = testState;
testLayerGrad(config, "mixed", batchSize, false, useGpu);
......
......@@ -804,10 +804,14 @@ TEST(Layer, ExpandLayer) {
testExpandLayer("seq", true); // seq expand to hasSubseq
}
void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
void testDegradeLayer(bool hasSubseq,
string layer_type,
string trans_type,
int stride) {
TestConfig config;
config.layerConfig.set_type(layer_type);
config.layerConfig.set_size(10);
config.layerConfig.set_seq_pool_stride(stride);
config.biasSize = 0;
config.inputDefs.push_back(
......@@ -827,36 +831,46 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
if (layer_type == "average") {
for (auto strategy : {"average", "sum", "squarerootn"}) {
LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
<< " average_strategy=" << strategy;
<< " average_strategy=" << strategy
<< " seq_pool_stride=" << stride;
config.layerConfig.set_average_strategy(strategy);
testDegradeLayerGrad(config, layer_type);
}
} else {
LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type;
LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
<< " seq_pool_stride=" << stride;
testDegradeLayerGrad(config, layer_type);
}
}
TEST(Layer, MaxLayer) {
testDegradeLayer(false, "max", "non-seq"); // seq max to non-seq
testDegradeLayer(true, "max", "non-seq"); // hasSubseq max to non-seq
testDegradeLayer(true, "max", "seq"); // hasSubseq max to seq
testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq
testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq
testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq
}
TEST(Layer, SequenceLastInstanceLayer) {
testDegradeLayer(false,
"seqlastins",
"non-seq"); // seq seqlastins to non-seq
"non-seq",
-1); // seq seqlastins to non-seq
testDegradeLayer(false,
"seqlastins",
"non-seq",
5); // seq seqlastins to a shorten seq, stride window = 5
testDegradeLayer(true,
"seqlastins",
"non-seq"); // hasSubseq seqlastins to non-seq
testDegradeLayer(true, "seqlastins", "seq"); // hasSubseq seqlastins to seq
"non-seq",
-1); // hasSubseq seqlastins to non-seq
testDegradeLayer(
true, "seqlastins", "seq", -1); // hasSubseq seqlastins to seq
}
TEST(Layer, AverageLayer) {
testDegradeLayer(false, "average", "non-seq"); // seq average to non-seq
testDegradeLayer(true, "average", "non-seq"); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq"); // hasSubseq average to seq
testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq
testDegradeLayer(
true, "average", "non-seq", -1); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq
}
TEST(Layer, SequenceConcatLayer) {
......@@ -1642,6 +1656,25 @@ TEST(Layer, PadLayer) {
}
}
TEST(Layer, CrossChannelNormLayer) {
TestConfig config;
config.layerConfig.set_type("norm");
config.layerConfig.set_size(100);
LayerInputConfig* input = config.layerConfig.add_inputs();
NormConfig* norm = input->mutable_norm_conf();
norm->set_norm_type("cross-channel-norm");
norm->set_channels(10);
norm->set_size(100);
norm->set_scale(0);
norm->set_pow(0);
norm->set_blocked(0);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
for (auto useGpu : {false, true}) {
testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
}
}
TEST(Layer, smooth_l1) {
TestConfig config;
config.layerConfig.set_type("smooth_l1");
......
......@@ -1453,6 +1453,24 @@ void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
true_type() /* bAsRowVector */, false_type());
}
template<class T>
void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0);
int numRows = height_;
int numCols = width_;
applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
false_type(), true_type() /* bAsColVector */);
}
template<class T>
void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0);
int numRows = height_;
int numCols = width_;
applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
false_type(), true_type() /* bAsColVector */);
}
template<>
template <class Agg>
int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
......
......@@ -545,6 +545,9 @@ public:
void mulRowVector(BaseMatrixT& b);
void divRowVector(BaseMatrixT& b);
void mulColVector(BaseMatrixT& b);
void divColVector(BaseMatrixT& b);
void addP2P(BaseMatrixT& b);
/**
......
......@@ -85,11 +85,16 @@ int getrf<float>(const CBLAS_ORDER order,
float* A,
const int lda,
int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return clapack_sgetrf(order, M, N, A, lda, ipiv);
#else
return LAPACKE_sgetrf(order, M, N, A, lda, ipiv);
#endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
}
template <>
......@@ -99,11 +104,16 @@ int getrf<double>(const CBLAS_ORDER order,
double* A,
const int lda,
int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return clapack_dgetrf(order, M, N, A, lda, ipiv);
#else
return LAPACKE_dgetrf(order, M, N, A, lda, ipiv);
#endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
}
template <>
......@@ -112,11 +122,16 @@ int getri<float>(const CBLAS_ORDER order,
float* A,
const int lda,
const int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return clapack_sgetri(order, N, A, lda, ipiv);
#else
return LAPACKE_sgetri(order, N, A, lda, ipiv);
#endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
}
template <>
......@@ -125,11 +140,16 @@ int getri<double>(const CBLAS_ORDER order,
double* A,
const int lda,
const int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return clapack_dgetri(order, N, A, lda, ipiv);
#else
return LAPACKE_dgetri(order, N, A, lda, ipiv);
#endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
}
template <>
......
......@@ -17,11 +17,14 @@ limitations under the License. */
#ifdef PADDLE_USE_MKL
#include <mkl.h>
#ifdef PADDLE_USE_LAPACK
#include <mkl_lapacke.h>
#endif
#else
extern "C" {
#include <cblas.h>
}
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
extern "C" {
#include <clapack.h>
......@@ -30,6 +33,7 @@ extern "C" {
#include <lapacke.h>
#endif
#endif
#endif
#include <cmath>
......
......@@ -483,6 +483,20 @@ void GpuMatrix::sequenceAvgForward(Matrix& a,
hl_sequence_avg_forward(dst, src, starts, height, width, mode);
}
void GpuMatrix::sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
size_t height = a.getHeight();
size_t width = getWidth();
CHECK_EQ(height, startsPos.getSize() - 1);
CHECK_EQ(width, a.getWidth());
real* dst = getData();
real* src = a.getData();
const int* starts = startsPos.getData();
hl_sequence_avg_backward(dst, src, starts, height, width, mode);
}
/* this = scaleAB*(a*b) + scaleT*this */
void GpuMatrix::mul(const GpuMatrix& a,
const GpuMatrix& b,
......@@ -2304,6 +2318,41 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
}
}
void CpuMatrix::sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
size_t height = a.getHeight();
size_t width = getWidth();
CHECK_EQ(height, startsPos.getSize() - 1);
CHECK_EQ(width, a.getWidth());
real* dst = getData();
real* src = a.getData();
const int* starts = startsPos.getData();
MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
for (size_t i = 0; i < height; ++i) {
int sequenceLength = starts[i + 1] - starts[i];
if (0 == sequenceLength) {
// empty sequence
continue;
}
outMtx->setData(dst + starts[i] * width, sequenceLength, width);
dataMtx->setData(src + i * width);
if (mode == 0) {
// plain average
outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
} else if (mode == 1) {
// sum instead of average
outMtx->addBias(*dataMtx, 1.0f);
} else if (mode == 2) {
// divide by square root of sequenceLength
outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
} else {
LOG(FATAL) << "should not reach here";
}
}
}
/* this = scaleAB*(a*b) + scaleT*this*/
void CpuMatrix::mul(const Matrix& a,
const Matrix& b,
......@@ -2377,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
int lda = a->getStride();
int ldb = b->getStride();
int ldc = getStride();
#ifndef PADDLE_TYPE_DOUBLE
cblas_sgemm(CblasRowMajor,
a_trans,
b_trans,
M,
N,
K,
scaleAB,
A,
lda,
B,
ldb,
scaleT,
C,
ldc);
#else
cblas_dgemm(CblasRowMajor,
a_trans,
b_trans,
M,
N,
K,
scaleAB,
A,
lda,
B,
ldb,
scaleT,
C,
ldc);
// TODO(yuyang18): Is gemm defined other place?
#endif
VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0]
<< " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1];
gemm<real>(
a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
}
void CpuMatrix::mul(
......
......@@ -461,6 +461,12 @@ public:
LOG(FATAL) << "Not implemented";
}
virtual void sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
LOG(FATAL) << "Not implemented";
}
/**
* @code
* this = scaleAB*(a*b) + scaleT*this
......@@ -1203,6 +1209,7 @@ public:
void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
/**
* @code
......@@ -1619,6 +1626,7 @@ public:
void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
/**
* @code
......
......@@ -13,119 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "SIMDFunctions.h"
#ifdef __SSE3__
#include <immintrin.h>
#endif
#include <algorithm>
#ifndef __AVX__
static void addto_sse(float* a, const float* b, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
mb0 = _mm_load_ps(b);
mb1 = _mm_load_ps(b + 4);
mb2 = _mm_load_ps(b + 8);
mb3 = _mm_load_ps(b + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) a[i] += b[i];
}
static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
for (int i = 0; i < batch; i++) {
mb0 = _mm_load_ps(b[i]);
mb1 = _mm_load_ps(b[i] + 4);
mb2 = _mm_load_ps(b[i] + 8);
mb3 = _mm_load_ps(b[i] + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
b[i] += 16;
}
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) {
for (int k = 0; k < batch; k++) a[i] += b[k][i];
}
return;
}
static void col_max_sse(float* result,
const float* data,
int dim,
int numSamples) {
// first sample, direct copy
for (int d = 0; d < dim; ++d) {
result[d] = data[d];
}
int offset = dim % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
// first 16n dims
for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
ma0 = _mm_load_ps(result);
ma1 = _mm_load_ps(result + 4);
ma2 = _mm_load_ps(result + 8);
ma3 = _mm_load_ps(result + 12);
for (int i = 1; i < numSamples; i++) {
mb0 = _mm_load_ps(data + i * dim);
mb1 = _mm_load_ps(data + i * dim + 4);
mb2 = _mm_load_ps(data + i * dim + 8);
mb3 = _mm_load_ps(data + i * dim + 12);
ma0 = _mm_max_ps(ma0, mb0);
ma1 = _mm_max_ps(ma1, mb1);
ma2 = _mm_max_ps(ma2, mb2);
ma3 = _mm_max_ps(ma3, mb3);
}
_mm_store_ps(result, ma0);
_mm_store_ps(result + 4, ma1);
_mm_store_ps(result + 8, ma2);
_mm_store_ps(result + 12, ma3);
}
// last dims
for (int d = 0; d < offset; ++d) {
float sm = data[d];
for (int i = 1; i < numSamples; ++i) {
sm = std::max(sm, data[i * dim + d]);
}
result[d] = sm;
}
}
#else
#ifdef __AVX__
static void addto_avx(float* a, const float* b, size_t len) {
int offset = len % 32;
......@@ -355,17 +248,128 @@ static void decayL1_avx(
}
}
#elif defined(__SSE3__)
static void addto_sse(float* a, const float* b, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
mb0 = _mm_load_ps(b);
mb1 = _mm_load_ps(b + 4);
mb2 = _mm_load_ps(b + 8);
mb3 = _mm_load_ps(b + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) a[i] += b[i];
}
static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
for (int i = 0; i < batch; i++) {
mb0 = _mm_load_ps(b[i]);
mb1 = _mm_load_ps(b[i] + 4);
mb2 = _mm_load_ps(b[i] + 8);
mb3 = _mm_load_ps(b[i] + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
b[i] += 16;
}
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) {
for (int k = 0; k < batch; k++) a[i] += b[k][i];
}
return;
}
static void col_max_sse(float* result,
const float* data,
int dim,
int numSamples) {
// first sample, direct copy
for (int d = 0; d < dim; ++d) {
result[d] = data[d];
}
int offset = dim % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
// first 16n dims
for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
ma0 = _mm_load_ps(result);
ma1 = _mm_load_ps(result + 4);
ma2 = _mm_load_ps(result + 8);
ma3 = _mm_load_ps(result + 12);
for (int i = 1; i < numSamples; i++) {
mb0 = _mm_load_ps(data + i * dim);
mb1 = _mm_load_ps(data + i * dim + 4);
mb2 = _mm_load_ps(data + i * dim + 8);
mb3 = _mm_load_ps(data + i * dim + 12);
ma0 = _mm_max_ps(ma0, mb0);
ma1 = _mm_max_ps(ma1, mb1);
ma2 = _mm_max_ps(ma2, mb2);
ma3 = _mm_max_ps(ma3, mb3);
}
_mm_store_ps(result, ma0);
_mm_store_ps(result + 4, ma1);
_mm_store_ps(result + 8, ma2);
_mm_store_ps(result + 12, ma3);
}
// last dims
for (int d = 0; d < offset; ++d) {
float sm = data[d];
for (int i = 1; i < numSamples; ++i) {
sm = std::max(sm, data[i * dim + d]);
}
result[d] = sm;
}
}
#endif
#ifndef __AVX__
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#else
#if defined(__AVX__)
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#elif defined(__SSE3__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#endif
namespace paddle {
namespace simd {
namespace internal {
#ifdef __SSE3__
void addToImpl(float* a, const float* b, size_t len) {
SIMD_INVOKE(addto, a, b, len);
}
......@@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
void colMaxImpl(float* result, const float* data, int dim, int numSamples) {
SIMD_INVOKE(col_max, result, data, dim, numSamples);
}
#endif
#ifdef __AVX__
void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) {
......@@ -385,8 +390,8 @@ void decayL1AvxImpl(
float* dst, float* src, float* lr, float lambda, size_t len) {
decayL1_avx(dst, src, lr, lambda, len);
}
#endif
} // namespace internal
} // namespace simd
} // namespace paddle
......@@ -128,17 +128,29 @@ void decayL1AvxImpl(
template <>
inline void addTo(float* a, const float* b, size_t len) {
#ifdef __SSE3__
internal::addToImpl(a, b, len);
#else
naive::addTo(a, b, len);
#endif
}
template <>
inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
#ifdef __SSE3__
internal::batchAddToImpl(a, b, batch, len);
#else
naive::batchAddTo(a, b, batch, len);
#endif
}
template <>
inline void colMax(float* result, const float* data, int dim, int numSamples) {
#ifdef __SSE3__
internal::colMaxImpl(result, data, dim, numSamples);
#else
naive::colMax(result, data, dim, numSamples);
#endif
}
template <>
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "Storage.h"
#include "Allocator.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
DEFINE_int32(pool_limit_size,
......@@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
}
if (gpuAllocator_[deviceId] == nullptr) {
std::string name =
"gpu" + std::to_string(deviceId) + std::string("_pool");
"gpu" + str::to_string(deviceId) + std::string("_pool");
gpuAllocator_[deviceId] =
new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name);
}
......
......@@ -110,6 +110,8 @@ TEST(BaseMatrix, BaseMatrix) {
compare(&BaseMatrix::addRowVector);
compare(&BaseMatrix::mulRowVector);
compare(&BaseMatrix::divRowVector);
compare(&BaseMatrix::mulColVector);
compare(&BaseMatrix::divColVector);
compare(&BaseMatrix::addP2P);
compare(&BaseMatrix::invSqrt);
}
......
......@@ -685,7 +685,7 @@ TEST(SMatrix, topK) {
}
}
void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInput->randomizeUniform();
......@@ -706,15 +706,25 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
TensorCheckErr(*cpuOutput, *gpuOutput);
MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInGrad->randomizeUniform();
gpuInGrad->copyFrom(*cpuInGrad);
cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
TensorCheckErr(*cpuInGrad, *gpuInGrad);
}
TEST(Matrix, sequenceAvgForward) {
TEST(Matrix, sequenceAvg) {
for (auto batchSize : {10, 128, 6000}) {
for (auto inputDim : {32, 100, 512}) {
for (auto mode : {0, 1, 2}) {
VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
<< " mode=" << mode;
testMatrixSequenceAvgForward(batchSize, inputDim, mode);
testMatrixSequenceAvg(batchSize, inputDim, mode);
}
}
}
......
......@@ -559,6 +559,49 @@ void Argument::degradeSequence(const Argument& input) {
tgtBuf[numSequences] = numSubSequences;
}
void Argument::poolSequenceWithStride(const Argument& input,
size_t stride,
IVectorPtr* stridePostions,
bool reversed) {
// If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
// then sequenceStartPositions = [0, 2, 3, 4, 7].
// If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30];
// else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30]
CHECK(input.sequenceStartPositions);
CHECK_EQ(input.hasSubseq(), 0UL);
CHECK_GT(stride, 0) << "stride must larger than 0";
size_t numSequences = input.getNumSequences();
ICpuGpuVector::resizeOrCreate(
sequenceStartPositions, numSequences + 1, false);
const int* starts = input.sequenceStartPositions->getData(false);
int* tgtBuf = sequenceStartPositions->getMutableData(false);
// first index of target sequence and stride positions are both 0
tgtBuf[0] = 0;
std::vector<int> stridePos;
for (size_t seqId = 0; seqId < numSequences; ++seqId) {
size_t seqLength = starts[seqId + 1] - starts[seqId];
stridePos.emplace_back(starts[seqId]);
if (seqLength == 0) {
// empty sequence
tgtBuf[seqId + 1] = tgtBuf[seqId];
} else {
int size = ceil((float)seqLength / stride);
tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
for (int i = 0; i < size - 1; ++i) {
int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
: stridePos.back() + stride;
stridePos.emplace_back(cur);
}
}
}
stridePos.emplace_back(starts[numSequences]);
int size = stridePos.size();
CHECK_EQ(size - 1, tgtBuf[numSequences]);
IVector::resizeOrCreate(*stridePostions, size, false);
(*stridePostions)->copyFrom(stridePos.data(), size);
}
void Argument::getValueString(
std::unordered_map<std::string, std::string>* out) const {
if (value) {
......
......@@ -291,6 +291,15 @@ struct Argument {
*/
void degradeSequence(const Argument& input);
/*
After pooling with stride n (n is smaller than sequence length),
a long sequence will be shorten.
This function is invalid for sequence having sub-sequence.
*/
void poolSequenceWithStride(const Argument& input,
size_t stride,
IVectorPtr* stridePositions,
bool reversed = false);
/**
* @brief getValueString will return the argument's output in string. There
* are several kinds of output. The keys of output dictionary are 'value',
......
add_simple_unittest(test_common)
add_simple_unittest(test_argument)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/parameter/Argument.h>
using namespace paddle; // NOLINT
TEST(Argument, poolSequenceWithStride) {
Argument input, output;
ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false);
int* inStart = input.sequenceStartPositions->getMutableData(false);
inStart[0] = 0;
inStart[1] = 9;
inStart[2] = 14;
inStart[3] = 17;
inStart[4] = 30;
int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30};
int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
for (auto reversed : {false, true}) {
IVectorPtr stridePositions;
output.poolSequenceWithStride(
input, 5 /* stride */, &stridePositions, reversed);
const int* outStart = output.sequenceStartPositions->getData(false);
CHECK_EQ(outStart[0], 0);
CHECK_EQ(outStart[1], 2);
CHECK_EQ(outStart[2], 3);
CHECK_EQ(outStart[3], 4);
CHECK_EQ(outStart[4], 7);
CHECK_EQ(stridePositions->getSize(), 8);
auto result = reversed ? strideResultReversed : strideResult;
for (int i = 0; i < 8; i++) {
CHECK_EQ(stridePositions->getData()[i], result[i]);
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/utils/Flags.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h"
DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
DEFINE_double(async_lagged_ratio_min,
......@@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
callback(response);
/// always defined, barrier slowest node function need it.
statSet_.reset(new StatSet("ParameterServer" + std::to_string(serverId_)));
statSet_.reset(new StatSet("ParameterServer" +
str::to_string(static_cast<int>(serverId_))));
}
real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
......
......@@ -160,10 +160,19 @@ class SparseFloatScanner(SparseBinaryScanner):
class IndexScanner(IScanner):
def __init__(self, input_type, pos):
IScanner.__init__(self, input_type, pos)
self.__ids__ = []
self.__ids__ = None
self.__idx__ = 0
def pre_scan(self, dat):
self.__idx__ += 1
def finish_pre_scan(self, argument):
self.__ids__ = [0] * self.__idx__
self.__idx__ = 0
def scan(self, dat):
self.__ids__.append(dat)
self.__ids__[self.__idx__] = dat
self.__idx__ += 1
def finish_scan(self, argument):
ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
......@@ -178,6 +187,13 @@ class SequenceScanner(IScanner):
self.__inner_scanner__ = inner_scanner
self.__setter__ = setter
def pre_scan(self, dat):
for each in dat:
self.__inner_scanner__.pre_scan(each)
def finish_pre_scan(self, argument):
self.__inner_scanner__.finish_pre_scan(argument)
def scan(self, dat):
self.__seq__.append(self.__seq__[-1] + self.get_size(dat))
for each in dat:
......
......@@ -83,13 +83,17 @@ def __arguments_to_numpy__(i, arg):
assert isinstance(arg, swig_paddle.Arguments)
value = arg.getSlotValue(i)
ids = arg.getSlotIds(i)
prob = arg.getSlotIn(i)
if value is not None:
assert isinstance(value, swig_paddle.Matrix)
value = value.copyToNumpyMat()
if ids is not None:
assert isinstance(ids, swig_paddle.IVector)
ids = ids.copyToNumpyArray()
return {"value": value, "id": ids}
if prob is not None:
assert isinstance(prob, swig_paddle.Matrix)
prob = prob.copyToNumpyMat()
return {"value": value, "id": ids, "prob": prob}
def __monkeypatch_gradient_machine__():
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册