diff --git a/CMakeLists.txt b/CMakeLists.txt index e0db0d535b3fc661c6398f74e17d2cb048217677..1a59db8c71bf3b1ea472c1ee56a1cd97de42dad8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,19 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.0) - -project(paddle CXX C) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") set(PROJ_ROOT ${CMAKE_SOURCE_DIR}) +include(system) + +if(ANDROID) + cmake_minimum_required(VERSION 3.7) +else() + cmake_minimum_required(VERSION 3.0) +endif() + +project(paddle CXX C) + find_package(Sphinx) -find_package(CUDA QUIET) +if(NOT CMAKE_CROSSCOMPILING) + find_package(CUDA QUIET) +endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) -include(system) include(simd) ################################ Configurations ####################################### @@ -51,6 +58,21 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() +if(ANDROID) + if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21") + endif() + + set(WITH_GPU OFF CACHE STRING + "Disable GPU when cross-compiling for Android" FORCE) + set(WITH_AVX OFF CACHE STRING + "Disable AVX when cross-compiling for Android" FORCE) + set(WITH_PYTHON OFF CACHE STRING + "Disable PYTHON when cross-compiling for Android" FORCE) + set(WITH_RDMA OFF CACHE STRING + "Disable RDMA when cross-compiling for Android" FORCE) +endif(ANDROID) + set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") ######################################################################################## @@ -64,6 +86,7 @@ include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc +include(external/any) # download libn::any include(package) # set paddle packages include(cpplint) # set paddle c++ style @@ -74,7 +97,6 @@ include(flags) # set paddle compile flags include(cudnn) # set cudnn libraries include(version) # set PADDLE_VERSION include(coveralls) # set code coverage - include(configure) # add paddle env configuration include_directories("${PROJ_ROOT}") diff --git a/Dockerfile b/Dockerfile index c4502e863f2d9fb771f88218a795a44283818186..59287c52257825923a1c898b9b0bb0a11d4c04cf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,13 +7,12 @@ ARG UBUNTU_MIRROR RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' # ENV variables -ARG BUILD_WOBOQ ARG WITH_GPU ARG WITH_AVX ARG WITH_DOC ARG WITH_STYLE_CHECK -ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF} +ENV WOBOQ OFF ENV WITH_GPU=${WITH_AVX:-OFF} ENV WITH_AVX=${WITH_AVX:-ON} ENV WITH_DOC=${WITH_DOC:-OFF} @@ -48,7 +47,7 @@ RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \ cd .. && rm -rf cmake-3.4.1 -VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"] +VOLUME ["/woboq_out"] # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 235c95f017f2b6ef24195a0210ccafff36b6ed61..b8bf1bb07a1f779354b2c10071264bf41d279f6c 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF) set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") -find_path(MKL_INCLUDE_DIR mkl.h PATHS +find_path(MKL_INC_DIR mkl.h PATHS ${MKL_ROOT}/include) -find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS +find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS ${MKL_ROOT}/include) find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib @@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib/intel64) -if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) +if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) set(CBLAS_PROVIDER MKL) - set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR}) + set(CBLAS_INC_DIR ${MKL_INC_DIR}) set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB}) add_definitions(-DPADDLE_USE_MKL) message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(${MKL_LAPACK_INC_DIR}) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})") + endif() return() # return file. endif() @@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3 PATHS ${ATLAS_LIB_SEARCH_PATHS}) -if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB) +if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND) set(CBLAS_PROVIDER ATLAS) - set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR}) + set(CBLAS_INC_DIR ${ATLAS_INC_DIR}) set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB}) add_definitions(-DPADDLE_USE_ATLAS) - message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(ATLAS_CLAPACK_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") + endif() return() endif() @@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB) set(CBLAS_PROVIDER OPENBLAS) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(OPENBLAS_LAPACKE_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + endif() return() endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 0bb016201dd8ae912ac8ec9f925bc5277fad7aed..5e507e78f74eee885922f502f35e3c15fafb622d 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -32,6 +32,14 @@ if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) +if(NOT CMAKE_CROSSCOMPILING) + if(WITH_AVX AND AVX_FOUND) + set(SIMD_FLAG ${AVX_FLAG}) + elseif(SSE3_FOUND) + set(SIMD_FLAG ${SSE3_FLAG}) + endif() +endif() + if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) @@ -48,21 +56,12 @@ else() message(FATAL_ERROR "Paddle need cudnn to 
compile") endif() - if(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}") - else(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") - endif(WITH_AVX) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) -if(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}") -else(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}") -endif(WITH_AVX) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 5754407d66c18872bf0cf314ee6e0a32e0d4329d..af9be86961833dcd62371227165d411a3b61d79e 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -1,3 +1,7 @@ +if(NOT WITH_GPU) + return() +endif() + set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT") find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8116f235d535917c03deb646ff4ec083a0cdadc7 --- /dev/null +++ b/cmake/external/any.cmake @@ -0,0 +1,20 @@ +INCLUDE(ExternalProject) + +SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) + +INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) + +ExternalProject_Add( + linb_any + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/thelink2012/any.git" + GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" + PREFIX ${ANY_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 2a49d76eb30f592a28746f5897b14b7dd319d784..0afb3ab9af48046af01f03838eefa0bd2fcb2821 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -31,9 +31,17 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/gflags/gflags.git" PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DBUILD_TESTING=OFF + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=Release ) LIST(APPEND external_project_dependencies gflags) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index ab105611c812a4f4b642ac5b1213fdfe93fab97d..4a9e2ecc6bbe74c5856a55fb0c982777d7ac25b7 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -33,11 +33,19 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/google/glog.git" PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DWITH_GFLAGS=ON CMAKE_ARGS 
-Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
    CMAKE_ARGS      -DBUILD_TESTING=OFF
+   CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+   CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                    -DCMAKE_BUILD_TYPE:STRING=Release
 )
 LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 11d829a9e2f239848803130505c9862695b25029..49c7d71443cda700a14af6be65ff6658eec7229f 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -41,11 +41,19 @@ IF(WITH_TESTING)
         GIT_TAG         "release-1.8.0"
         PREFIX          ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+        CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
         CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
         CMAKE_ARGS      -DBUILD_GMOCK=ON
         CMAKE_ARGS      -Dgtest_disable_pthreads=ON
         CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+        CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                         -DCMAKE_BUILD_TYPE:STRING=Release
     )
     LIST(APPEND external_project_dependencies gtest)
 ENDIF(WITH_TESTING)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 00dde9a9fdd4d4825947b987b3e8e0460f4a5f3a..92ea23c7633e974fd09251f967965364b1928307 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,7 +29,24 @@ IF(NOT ${CBLAS_FOUND})
     IF(CMAKE_COMPILER_IS_GNUCC)
         ENABLE_LANGUAGE(Fortran)
-        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+        if (NOT CMAKE_Fortran_COMPILER_VERSION)
+          # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly.
+          execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion
+                          OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION)
+        endif()
+        string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION})
+        list(GET Fortran_VERSION 0 Fortran_MAJOR)
+        list(GET Fortran_VERSION 1 Fortran_MINOR)
+        find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS
+          /lib
+          /usr/lib
+          /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/
+          /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/)
+        if (NOT GFORTRAN_LIBRARY)
+          message(FATAL_ERROR "Cannot find the gfortran library, which is required by openblas")
+        endif()
+        find_package(Threads REQUIRED)
+        LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
     ENDIF(CMAKE_COMPILER_IS_GNUCC)
     IF(NOT CMAKE_Fortran_COMPILER)
@@ -37,6 +54,8 @@ IF(NOT ${CBLAS_FOUND})
                 "you need to set gfortran compiler: cmake ..
-DCMAKE_Fortran_COMPILER=...") ENDIF(NOT CMAKE_Fortran_COMPILER) + ADD_DEFINITIONS(-DPADDLE_USE_LAPACK) + ExternalProject_Add( openblas ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index ad1426fd940c7b163668c33d41731fe75d89dd89..2df042d226af8308d00f7870e7d2de0eacfdf07e 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -58,12 +58,20 @@ IF(NOT PROTOBUF_FOUND) GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" CONFIGURE_COMMAND ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake - -Dprotobuf_BUILD_TESTS=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib + -Dprotobuf_BUILD_TESTS=OFF + -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DZLIB_ROOT:STRING=${ZLIB_ROOT} ) LIST(APPEND external_project_dependencies protobuf) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 93d7275df05d723d7dd66ef0c5ac15672c051c34..9fd3afd0998b38c18b4490e6fb1c6fe0222ed142 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -219,9 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) -INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) -INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) - -IF(NOT WITH_PYTHON) +IF(WITH_PYTHON) + INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) + INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) +ELSE() SET(PYTHON_LIBRARIES "") ENDIF() diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 172c318b35d611d0432b78f2a18eb58a7d272b90..293070c3cfcc1196001f64469f3254289b0de792 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -50,12 +50,19 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} CMAKE_ARGS -DWITH_GPU=${WITH_GPU} CMAKE_ARGS -DWITH_OMP=${USE_OMP} CMAKE_ARGS -DWITH_TORCH=OFF - CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE + CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON CMAKE_ARGS -DBUILD_SHARED=ON + CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) LIST(APPEND external_project_dependencies warpctc) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 47fa8817fb64fb8fd718e2892ad5bae7bbe956eb..45ca5542b7dc30216b45487782f849b93c5f8fca 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire IF(WIN32) SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." 
FORCE) ELSE(WIN32) - set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) @@ -34,10 +34,18 @@ ExternalProject_Add( GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=Release ) LIST(APPEND external_project_dependencies zlib) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index b76852fc6c50e80633c8294fb2724b83f15293a7..7eb92efcb00fa18461e61e0508b485c13ef23a1f 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -2,6 +2,7 @@ include(CheckCXXCompilerFlag) include(CheckCCompilerFlag) include(CheckCXXSymbolExists) +include(CheckTypeSize) function(CheckCompilerCXX11Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag) endfunction() CheckCompilerCXX11Flag() -LIST(APPEND CMAKE_CXX_FLAGS -std=c++11) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # safe_set_flag # @@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS) endif() endif() +SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h") +CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND) +CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND) +if(SPINLOCK_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK) +endif(SPINLOCK_FOUND) +if(BARRIER_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_BARRIER) +endif(BARRIER_FOUND) +SET(CMAKE_EXTRA_INCLUDE_FILES "") + # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. set(COMMON_FLAGS diff --git a/cmake/simd.cmake b/cmake/simd.cmake index d380c996dfa95f0caa2b9cd9daa0ac9141e51fe0..46035a908b588861607a25d3a21cf34b7b6fd4b8 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -2,6 +2,7 @@ # so that PaddlePaddle can unleash the vectorization power of muticore. 
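# A quick aside on how the probes below work (a minimal sketch, not part of
# this patch): CHECK_CXX_SOURCE_RUNS compiles *and executes* the given
# program, so a SIMD level is only reported as found when the build machine
# can actually run those instructions. The probe picks up
# CMAKE_REQUIRED_FLAGS, which is why each check sets it first -- and why the
# change below saves and restores it, so later checks do not inherit, say,
# -mavx. A standalone example, assuming a GCC/Clang toolchain (the result
# variable DEMO_SSE3_FOUND is hypothetical):
#
#   include(CheckCXXSourceRuns)
#   set(CMAKE_REQUIRED_FLAGS "-msse3")
#   CHECK_CXX_SOURCE_RUNS("
#     #include <pmmintrin.h>
#     int main() {
#       __m128d a = _mm_set1_pd(1.0);
#       a = _mm_hadd_pd(a, a);  // _mm_hadd_pd is an SSE3 instruction
#       return 0;
#     }" DEMO_SSE3_FOUND)
#   set(CMAKE_REQUIRED_FLAGS "")  # restore before the next check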
INCLUDE(CheckCXXSourceRuns) +INCLUDE(CheckCXXSourceCompiles) IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(MMX_FLAG "-mmmx") @@ -17,6 +18,8 @@ ELSEIF(MSVC) SET(AVX2_FLAG "/arch:AVX2") ENDIF() +set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) + # Check MMX set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) CHECK_CXX_SOURCE_RUNS(" @@ -73,4 +76,5 @@ int main() return 0; }" AVX2_FOUND) +set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND) diff --git a/cmake/system.cmake b/cmake/system.cmake index 3e472da7e0bd9c433f92f3e8b52970cd2cc6dcba..3ca06665ab2385e34302a6bcce7ada549ea1e247 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") +IF(DEFINED CMAKE_SYSTEM_NAME) + IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android") + SET(ANDROID TRUE) + ENDIF() +ENDIF() + # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/cmake/util.cmake b/cmake/util.cmake index bacb64eb9ee65fffc824e4587a22fc432c092b19..099a85809d93e01772bf8c5c329fd9055ee4f054 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -90,6 +90,10 @@ function(link_paddle_exe TARGET_NAME) ${RDMA_LD_FLAGS} ${RDMA_LIBS}) + if(ANDROID) + target_link_libraries(${TARGET_NAME} log) + endif(ANDROID) + add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py index 5d138a8c4f91976d90b19441781248f7b67c854a..2809054e7d3a367f441188fe7f91037cfa5f1579 100644 --- a/demo/seqToseq/api_train_v2.py +++ b/demo/seqToseq/api_train_v2.py @@ -1,13 +1,17 @@ import sys + import paddle.v2 as paddle -def seqToseq_net(source_dict_dim, target_dict_dim): +def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): ### Network Architecture word_vector_dim = 512 # dimension of word vector decoder_size = 512 # dimension of hidden unit in GRU Decoder network encoder_size = 512 # dimension of hidden unit in GRU Encoder network + beam_size = 3 + max_length = 250 + #### Encoder src_word_id = paddle.layer.data( name='source_language_word', @@ -67,30 +71,57 @@ def seqToseq_net(source_dict_dim, target_dict_dim): group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True) group_inputs = [group_input1, group_input2] - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - # For decoder equipped with attention mechanism, in training, - # target embeding (the groudtruth) is the data input, - # while encoded source sequence is accessed to as an unbounded memory. - # Here, the StaticInput defines a read-only memory - # for the recurrent_group. 
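# A sketch of the pattern this rewrite introduces (illustrative, not part of
# the patch): one builder function that branches on is_generating.
#
#   cost = seqToseq_net(source_dict_dim, target_dict_dim)       # training
#   gen = seqToseq_net(source_dict_dim, target_dict_dim, True)  # generation
#
# In training the decoder consumes the ground-truth target embedding and the
# builder returns a classification cost; in generation, GeneratedInput feeds
# the last predicted word back into the decoder and the builder returns a
# beam_search layer (bos_id/eos_id mark sentence start/end, with beam_size=3
# and max_length=250 per the constants above).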
- decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost + if not is_generating: + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # For decoder equipped with attention mechanism, in training, + # target embeding (the groudtruth) is the data input, + # while encoded source sequence is accessed to as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name='target_language_next_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + + return cost + else: + # In generation, the decoder predicts a next target word based on + # the encoded source sequence and the last generated target word. + + # The encoded source sequence (encoder's output) must be specified by + # StaticInput, which is a read-only memory. + # Embedding of the last generated word is automatically gotten by + # GeneratedInputs, which is initialized by a start mark, such as , + # and must be included in generation. + + trg_embedding = paddle.layer.GeneratedInputV2( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) + group_inputs.append(trg_embedding) + + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, + eos_id=1, + beam_size=beam_size, + max_length=max_length) + + return beam_gen def main(): diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index f43e83d1297637a84f8a8bd581d1ab94089efc28..8fb9369e0e8e31e620169fa2856094c414efe23e 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -8,199 +8,255 @@ Please be aware that you will need to change `Dockers settings `_ to make full use of your hardware resource on Mac OS X and Windows. +Working With Docker +------------------- + +Docker is simple as long as we understand a few basic concepts: + +- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type + + .. code-block:: bash + + docker images + + to list all images in the system. We can also run + + .. code-block:: bash + + docker pull paddlepaddle/paddle:0.10.0rc2 + + to download a Docker image, paddlepaddle/paddle in this example, + from Dockerhub.com. + +- *container*: considering a Docker image a program, a container is a + "process" that runs the image. 
Indeed, a container is exactly an + operating system process, but with a virtualized filesystem, network + port space, and other virtualized environment. We can type + + .. code-block:: bash + + docker run paddlepaddle/paddle:0.10.0rc2 + + to start a container to run a Docker image, paddlepaddle/paddle in this example. + +- By default docker container have an isolated file system namespace, + we can not see the files in the host file system. By using *volume*, + mounted files in host will be visible inside docker container. + Following command will mount current dirctory into /data inside + docker container, run docker container from debian image with + command :code:`ls /data`. + + .. code-block:: bash + + docker run --rm -v $(pwd):/data debian ls /data Usage of CPU-only and GPU Images ---------------------------------- -For each version of PaddlePaddle, we release 2 types of Docker images: development -image and production image. Production image includes CPU-only version and a CUDA -GPU version and their no-AVX versions. We put the docker images on -`dockerhub.com `_. You can find the -latest versions under "tags" tab at dockerhub.com. -1. development image :code:`paddlepaddle/paddle:-dev` +For each version of PaddlePaddle, we release two types of Docker images: +development image and production image. Production image includes +CPU-only version and a CUDA GPU version and their no-AVX versions. We +put the docker images on `dockerhub.com +`_. You can find the +latest versions under "tags" tab at dockerhub.com - This image has packed related develop tools and runtime environment. Users and - developers can use this image instead of their own local computer to accomplish - development, build, releasing, document writing etc. While different version of - paddle may depends on different version of libraries and tools, if you want to - setup a local environment, you must pay attention to the versions. - The development image contains: - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - Many developers use servers with GPUs, they can use ssh to login to the server - and run :code:`docker exec` to enter the docker container and start their work. - Also they can start a development docker image with SSHD service, so they can login to - the container and start work. +1. Production images, this image might have multiple variants: - To run the CPU-only image as an interactive container: + - GPU/AVX::code:`paddlepaddle/paddle:-gpu` + - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` + - CPU/AVX::code:`paddlepaddle/paddle:` + - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - .. code-block:: bash + Please be aware that the CPU-only and the GPU images both use the + AVX instruction set, but old computers produced before 2008 do not + support AVX. The following command checks if your Linux computer + supports AVX: - docker run -it --rm paddledev/paddle: /bin/bash + .. code-block:: bash - or, we can run it as a daemon container + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - .. code-block:: bash + + To run the CPU-only image as an interactive container: - docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle: + .. code-block:: bash - and SSH to this container using password :code:`root`: + docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash - .. code-block:: bash + Above method work with the GPU image too -- the recommended way is + using `nvidia-docker `_. - ssh -p 2202 root@localhost + Please install nvidia-docker first following this `tutorial + `_. 
- An advantage of using SSH is that we can connect to PaddlePaddle from - more than one terminals. For example, one terminal running vi and - another one running Python interpreter. Another advantage is that we - can run the PaddlePaddle container on a remote server and SSH to it - from a laptop. + Now you can run a GPU image: + .. code-block:: bash -2. Production images, this image might have multiple variants: - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` + nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash - Please be aware that the CPU-only and the GPU images both use the AVX - instruction set, but old computers produced before 2008 do not support - AVX. The following command checks if your Linux computer supports - AVX: +2. development image :code:`paddlepaddle/paddle:-dev` - .. code-block:: bash + This image has packed related develop tools and runtime + environment. Users and developers can use this image instead of + their own local computer to accomplish development, build, + releasing, document writing etc. While different version of paddle + may depends on different version of libraries and tools, if you + want to setup a local environment, you must pay attention to the + versions. The development image contains: + + - gcc/clang + - nvcc + - Python + - sphinx + - woboq + - sshd + + Many developers use servers with GPUs, they can use ssh to login to + the server and run :code:`docker exec` to enter the docker + container and start their work. Also they can start a development + docker image with SSHD service, so they can login to the container + and start work. - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi +Train Model Using Python API +---------------------------- - If it doesn't, we will use the non-AVX images. +Our official docker image provides a runtime for PaddlePaddle +programs. The typical workflow will be as follows: - Above methods work with the GPU image too -- just please don't forget - to install GPU driver. To support GPU driver, we recommend to use - [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). Run using +Create a directory as workspace: - .. code-block:: bash +.. code-block:: bash - nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash + mkdir ~/workspace - Note: If you would have a problem running nvidia-docker, you may try the old method we have used (not recommended). +Edit a PaddlePaddle python program using your favourite editor - .. code-block:: bash +.. code-block:: bash - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:-gpu + emacs ~/workspace/example.py +Run the program using docker: -3. Use production image to release you AI application - Suppose that we have a simple application program in :code:`a.py`, we can test and run it using the production image: +.. code-block:: bash - ```bash - docker run -it -v $PWD:/work paddle /work/a.py - ``` + docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py - But this works only if all dependencies of :code:`a.py` are in the production image. 
If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs. +Or if you are using GPU for training: +.. code-block:: bash -PaddlePaddle Book ------------------- + nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py -The Jupyter Notebook is an open-source web application that allows -you to create and share documents that contain live code, equations, -visualizations and explanatory text in a single browser. +Above commands will start a docker container by running :code:`python +/workspace/example.py`. It will stop once :code:`python +/workspace/example.py` finishes. -PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. -We already exposed port 8888 for this book. If you want to -dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. +Another way is to tell docker to start a :code:`/bin/bash` session and +run PaddlePaddle program interactively: -We provide a packaged book image, simply issue the command: +.. code-block:: bash + + docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash + # now we are inside docker container + cd /workspace + python example.py + +Running with GPU is identical: .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash + # now we are inside docker container + cd /workspace + python example.py -Then, you would back and paste the address into the local browser: -.. code-block:: text +Develop PaddlePaddle or Train Model Using C++ API +--------------------------------------------------- - http://localhost:8888/ +We will be using PaddlePaddle development image since it contains all +compiling tools and dependencies. -That's all. Enjoy your journey! +Let's clone PaddlePaddle repo first: -Development Using Docker ------------------------- +.. code-block:: bash -Developers can work on PaddlePaddle using Docker. This allows -developers to work on different platforms -- Linux, Mac OS X, and -Windows -- in a consistent way. + git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle -1. Build the Development Docker Image +Mount both workspace folder and paddle code folder into docker +container, so we can access them inside docker container. There are +two ways of using PaddlePaddle development docker image: - .. code-block:: bash +- run interactive bash directly - git clone --recursive https://github.com/PaddlePaddle/Paddle - cd Paddle - docker build -t paddle:dev . + .. code-block:: bash - Note that by default :code:`docker build` wouldn't import source - tree into the image and build it. If we want to do that, we need docker the - development docker image and then run the following command: + # use nvidia-docker instead of docker if you need to use GPU + docker run -it -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /bin/bash + # now we are inside docker container - .. code-block:: bash +- or, we can run it as a daemon container - docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev + .. code-block:: bash + # use nvidia-docker instead of docker if you need to use GPU + docker run -d -p 2202:22 -p 8888:8888 -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /usr/sbin/sshd -D -2. 
Run the Development Environment + and SSH to this container using password :code:`root`: - Once we got the image :code:`paddle:dev`, we can use it to develop - Paddle by mounting the local source code tree into a container that - runs the image: + .. code-block:: bash - .. code-block:: bash + ssh -p 2202 root@localhost - docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev sshd + An advantage is that we can run the PaddlePaddle container on a + remote server and SSH to it from a laptop. - This runs a container of the development environment Docker image - with the local source tree mounted to :code:`/paddle` of the - container. +When developing PaddlePaddle, you can edit PaddlePaddle source code +from outside of docker container using your favoriate editor. To +compile PaddlePaddle, run inside container: - The above :code:`docker run` commands actually starts - an SSHD server listening on port 2202. This allows us to log into - this container with: +.. code-block:: bash - .. code-block:: bash + WITH_GPU=OFF WITH_AVX=ON WITH_TEST=ON bash /paddle/paddle/scripts/docker/build.sh - ssh root@localhost -p 2202 +This builds everything about Paddle in :code:`/paddle/build`. And we +can run unit tests there: - Usually, I run above commands on my Mac. I can also run them on a - GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it: +.. code-block:: bash - .. code-block:: bash + cd /paddle/build + ctest - my-mac$ ssh root@xxx.yyy.zzz.www -p 2202 +When training model using C++ API, we can edit paddle program in +~/workspace outside of docker. And build from /workspace inside of +docker. -3. Build and Install Using the Development Environment +PaddlePaddle Book +------------------ - Once I am in the container, I can use - :code:`paddle/scripts/docker/build.sh` to build, install, and test - Paddle: +The Jupyter Notebook is an open-source web application that allows +you to create and share documents that contain live code, equations, +visualizations and explanatory text in a single browser. - .. code-block:: bash +PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. +We already exposed port 8888 for this book. If you want to +dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. - /paddle/paddle/scripts/docker/build.sh +We provide a packaged book image, simply issue the command: - This builds everything about Paddle in :code:`/paddle/build`. And - we can run unit tests there: +.. code-block:: bash - .. code-block:: bash + docker run -p 8888:8888 paddlepaddle/book + +Then, you would back and paste the address into the local browser: + +.. code-block:: text + + http://localhost:8888/ - cd /paddle/build - ctest +That's all. Enjoy your journey! 
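The book runs as an ordinary Docker container, so the usual flags apply. For
example, to keep it running in the background and publish it on a different
local port (the host port 9999 below is arbitrary):

.. code-block:: bash

    docker run -d -p 127.0.0.1:9999:8888 paddlepaddle/book

The notebook is then served at :code:`http://localhost:9999/` instead.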
Documentation
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 6dc48704bc230bd1a573c4b4b2e7c07791e48ced..95cad835b11816f4d2e256c2abd662a545a5bad2 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -55,6 +55,7 @@ extensions = [
     'sphinx.ext.napoleon',
     'sphinx.ext.graphviz'
 ]
+mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"
 table_styling_embed_css = True
 autodoc_member_order = 'bysource'
diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
index f35bfbc5c8253d632f8089f5037421f527633aad..9c49a4bd2083794e98b099b25944bedec3d5a2ff 100644
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@@ -17,7 +17,11 @@ limitations under the License. */
 #include
 #include "hl_base.h"
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "hl_neon_matrix_kernel.cuh"
+#else
 #include "hl_sse_matrix_kernel.cuh"
+#endif

 /**
  * @brief cpu element wise unary operator.
diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh
index db35ee2037433163ebb3673edb350e3fab71fba9..8b755c1095c2c4fdb7e74d8cddc948e6a6af380b 100644
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
 typedef BaseOp SSEFirst;
 typedef BaseOp SSESecond;
 typedef BaseOp SSEClassificationError;
+#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "hl_matrix_base_neon.cuh"
 #else
 #include "hl_matrix_base_sse.cuh"
 #endif
diff --git a/paddle/cuda/include/hl_matrix_base_neon.cuh b/paddle/cuda/include/hl_matrix_base_neon.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e13019f5ee24ad600005c99678426ee3808b0e54
--- /dev/null
+++ b/paddle/cuda/include/hl_matrix_base_neon.cuh
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + + +#ifndef HL_MATRIX_BASE_NEON_CUH_ +#define HL_MATRIX_BASE_NEON_CUH_ + +namespace aggregate { +class SSESum { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vaddq_f32(a, b); + } +}; + +class SSEMax { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vmaxq_f32(a, b); + } +}; + +class SSEMin { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vminq_f32(a, b); + } +}; +} // namespace aggregate + +namespace base { +namespace unary { +class SSEIdentity { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a) const { + return a; + } +}; +} // namespace unary + +namespace binary { +class SSEAdd { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vaddq_f32(a, b); + } +}; + +class SSEAdd2 { +public: + static const bool sse = true; + const real p1; + const real p2; + float32x4_t mp1; + float32x4_t mp2; + +public: + SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) { + mp1 = vdupq_n_f32(p1); + mp2 = vdupq_n_f32(p2); + } + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp1, tmp2; + tmp1 = vmulq_f32(mp1, a); + tmp2 = vmulq_f32(mp2, b); + return vaddq_f32(tmp1, tmp2); + } +}; + +class SSESub { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vsubq_f32(a, b); + } +}; + +class SSEMul { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vmulq_f32(a, b); + } +}; + +class SSEDiv { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp; + tmp = vrecpeq_f32(b); + return vmulq_f32(a, tmp); + } +}; + +class SSESquaredDiff { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp; + tmp = vsubq_f32(a, b); + return vmulq_f32(tmp, tmp); + } +}; + +class SSEFirst { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return a; + } +}; + +class SSESecond { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return b; + } +}; + +class SSEClassificationError { +public: + static const bool sse = true; + const real p; + float32x4_t mp; + uint32x4_t result; + +public: + explicit SSEClassificationError(const real s) : p(s) { + mp = vdupq_n_f32(p); + result = vdupq_n_u32(1); + } + // TODO: to be check + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + uint32x4_t tmp1 = vcgtq_f32(a, mp); + uint32x4_t tmp2 = vcgtq_f32(b, mp); + uint32x4_t tmp3 = veorq_u32(tmp1, tmp2); + return vcvtq_f32_u32(vandq_u32(tmp3, result)); + } +}; +} // namespace binary +} // namespace base + +#endif /* HL_MATRIX_BASE_NEON_CUH_ */ diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 59213eee75f50d3c054ed8684a9a0e1053342a0a..f965ba966793f6f6eea0ad3606f60553fe904dda 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -17,13 +17,20 @@ limitations under the License. 
*/ #include "hl_base.h" -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) #include #ifndef PADDLE_TYPE_DOUBLE typedef float4 vecType; #else typedef double2 vecType; #endif +#elif (defined __ARM_NEON) || (defined __ARM_NEON__) +#include +#ifndef PADDLE_TYPE_DOUBLE +typedef float32x4_t vecType; +#else +#error NEON instructions does not support double precision +#endif #else #include #include diff --git a/paddle/cuda/include/hl_neon_matrix_kernel.cuh b/paddle/cuda/include/hl_neon_matrix_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..7b4e5b00079b66d0a46a1344a43f41962cf50f10 --- /dev/null +++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh @@ -0,0 +1,299 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#ifndef HL_NEON_MATRIX_KERNEL_CUH_ +#define HL_NEON_MATRIX_KERNEL_CUH_ + +#include "hl_matrix_type.cuh" + +#define VECTOR_SIZE 16 + +/* number of float in vector */ +#define VECTOR_LEN 4 +#define VECTOR_SET vdupq_n_f32 + +inline bool hl_check_align(size_t size) { + return !(size & (VECTOR_SIZE - 1)); +} + +inline bool hl_check_align(void *ptr) { + return hl_check_align(reinterpret_cast(ptr)); +} + +template +inline real hl_agg_op(Agg agg, vecType mm) { + float32x4_t rev = vrev64q_f32(mm); + float32x4_t tmp1 = agg.vecOp(rev, rev); + float32x2_t lo = vget_high_f32(rev); + float32x2_t hi = vget_low_f32(rev); + float32x4_t tmp2 = vcombine_f32(hi, lo); + float32x4_t ret = agg.vecOp(tmp1, tmp2); + + return vgetq_lane_f32(ret, 0); +} + +template +void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, int ld, + real *A, int lda) { + for (int i = 0; i < dimM; i++, A += lda) { + vecType mm = VECTOR_SET(agg.init()); + vecType *a = (vecType*)(A); + for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) { + mm = agg.vecOp(mm, op.vecOp(*a)); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + real tmp = hl_agg_op(agg, mm); + real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN; + for (int j = 0; j < rem; j++) { + tmp = agg(tmp, op(a[j])); + } + dst[i*ld] = sv(dst[i*ld], tmp); + } else { + dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm)); + } + } +} + +template +void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, int ld, + real *A, int lda, + real *B, int ldb) { + for (int i = 0; i < dimM; i++, A += lda, B += ldb) { + vecType mm = VECTOR_SET(agg.init()); + vecType *a = (vecType*)(A); + vecType *b = (vecType*)(B); + for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) { + mm = agg.vecOp(mm, op.vecOp(*a, *b)); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + real tmp = hl_agg_op(agg, mm); + real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN; + real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN; + for (int j = 0; j < rem; j++) { + tmp = agg(tmp, op(a[j], b[j])); + } + dst[i*ld] = sv(dst[i*ld], tmp); + } else { + dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm)); + } + } +} + +template +void hl_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int 
lda) { + for (int j = 0; j < dimN; j++) { + real tmp = agg.init(); + for (int i = 0; i < dimM; i++) { + tmp = agg(tmp, op(A[i * lda + j])); + } + dst[j] = sv(dst[j], tmp); + } +} + +template +void hl_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + for (int j = 0; j < dimN; j++) { + real tmp = agg.init(); + for (int i = 0; i < dimM; i++) { + tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j])); + } + dst[j] = sv(dst[j], tmp); + } +} + +/* + * MaxRow greater than or equal dimN + * dimN is multiples of VECTOR_LEN + * so rem <= MaxRow / VECTOR_LEN + */ +template +void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + vecType mm[MaxRow / VECTOR_LEN]; + for (int n = 0; n < MaxRow / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + A += (dimN / VECTOR_LEN) * VECTOR_LEN; + dst += (dimN / VECTOR_LEN) * VECTOR_LEN; + hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda); + } +} + +/* + * dimN is multiples of VECTOR_LEN + * dimN greater than Step + */ +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) { + vecType mm[Step / VECTOR_LEN]; + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + } + + int remRow = dimN % Step; + if (remRow) { + hl_sse_column_op_with_rem(agg, op, sv, dimM, remRow, dst, A, lda); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + if (dimN <= 16) { + hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda); + } else if (dimN <= 32) { + hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda); + } else if (dimN <= 1024 || dimM <= 512) { + hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda); + } else { + hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda); + } +} + +template +void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + vecType mm[MaxRow / VECTOR_LEN]; + for (int n = 0; n < MaxRow / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + vecType *b = (vecType*)(B + i * ldb); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + A += (dimN / VECTOR_LEN) * VECTOR_LEN; + B += (dimN / VECTOR_LEN) * VECTOR_LEN; + dst += (dimN / VECTOR_LEN) * VECTOR_LEN; + hl_matrix_column_op(agg, op, sv, dimM, rem, 
dst, A, lda, B, ldb); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) { + vecType mm[Step / VECTOR_LEN]; + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + vecType *b = (vecType*)(B + i * ldb); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + } + + int remRow = dimN % Step; + if (remRow) { + hl_sse_column_op_with_rem( + agg, op, sv, dimM, remRow, dst, A, lda, B, ldb); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + if (dimN <= 16) { + hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else if (dimN <= 32) { + hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else if (dimN <= 1024 || dimM <= 512) { + hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else { + hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } +} + +#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */ diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index 9f9d8f972e3a4c62e5caedcf85054be5681b96c1..973ddcceed99ba4177b3db277e664611d42ac51b 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -159,4 +159,10 @@ extern void hl_sequence_avg_forward(real* dst, int width, const int mode); +extern void hl_sequence_avg_backward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode); #endif /* HL_SEQUENCE_H_ */ diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index 05e51bce9e1df6fc6ef1cad891b44a9172da185d..920b417b1c717efaff75f70f1b9d2b574469e425 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -57,4 +57,10 @@ inline void hl_sequence_avg_forward(real* dst, int width, const int mode) {} +inline void hl_sequence_avg_backward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) {} #endif // HL_SEQUENCE_STUB_H_ diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index ba823de2720336851bf9c49d8162360af93e8601..0fe2877f89f8d0fbc4db40c400037be30bb87ff7 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -325,12 +325,12 @@ __global__ void KeSequenceAvgForward(real* dst, int seqLength = end - start; if (seqLength == 0) return; real sum = 0.0; - for (int i = 0; i < seqLength; i++) { - sum += src[(start + i) * width + col]; + for (int i = start; i < end; i++) { + sum += src[i * width + col]; } sum = mode == 1 ? sum : (mode == 0 ? 
sum / seqLength : sum * my_rsqrt((real)seqLength)); - dst[row * width + col] = sum; + dst[gid] = sum; } } @@ -354,3 +354,48 @@ void hl_sequence_avg_forward(real* dst, (dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_forward failed"); } + +__global__ void KeSequenceAvgBackward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int row = gid / width; + int col = gid % width; + + if (gid < height * width) { + int start = starts[row]; + int end = starts[row + 1]; + int seqLength = end - start; + if (seqLength == 0) return; + real grad = src[gid]; + grad = mode == 1 ? grad : + (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength)); + for (int i = start; i < end; i++) { + dst[i * width + col] += grad; + } + } +} + +void hl_sequence_avg_backward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) { + CHECK_NOTNULL(dst); + CHECK_NOTNULL(src); + CHECK_NOTNULL(starts); + + int block = 512; + int grid = DIVUP(width * height, 512); + + CHECK(mode == 0 || mode == 1 || mode == 2) + << "mode error in hl_sequence_avg_backward!"; + + KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> + (dst, src, starts, height, width, mode); + CHECK_SYNC("hl_sequence_avg_backward failed"); +} diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp index f47d55a4ade97d76e0f1940a2234e34e20efade6..f71c0f681b3bc524ba96c55f1dcad30ef59478c8 100644 --- a/paddle/function/Function.cpp +++ b/paddle/function/Function.cpp @@ -16,66 +16,6 @@ limitations under the License. */ namespace paddle { -template <> -size_t FuncConfig::get(const std::string& key) const { - auto it = valueMap_.find(key); - CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; - return it->second.s; -} - -template <> -real FuncConfig::get(const std::string& key) const { - auto it = valueMap_.find(key); - CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; - return it->second.r; -} - -template <> -int FuncConfig::get(const std::string& key) const { - auto it = valueMap_.find(key); - CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; - return it->second.i; -} - -template <> -bool FuncConfig::get(const std::string& key) const { - auto it = valueMap_.find(key); - CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; - return it->second.b; -} - -template <> -FuncConfig& FuncConfig::set(const std::string& key, size_t v) { - CHECK_EQ(static_cast(valueMap_.count(key)), 0) << "Duplicated value: " - << key; - valueMap_[key].s = v; - return *this; -} - -template <> -FuncConfig& FuncConfig::set(const std::string& key, real v) { - CHECK_EQ(static_cast(valueMap_.count(key)), 0) << "Duplicated value: " - << key; - valueMap_[key].r = v; - return *this; -} - -template <> -FuncConfig& FuncConfig::set(const std::string& key, int v) { - CHECK_EQ(static_cast(valueMap_.count(key)), 0) << "Duplicated value: " - << key; - valueMap_[key].i = v; - return *this; -} - -template <> -FuncConfig& FuncConfig::set(const std::string& key, bool v) { - CHECK_EQ(static_cast(valueMap_.count(key)), 0) << "Duplicated value: " - << key; - valueMap_[key].b = v; - return *this; -} - void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape, ArgType argType) { diff --git a/paddle/function/Function.h b/paddle/function/Function.h index 3bbeb6e525f85bdde9a54c8d60146eaa30a1bb4d..15eb35b7f7dac1b98f2d8694707d83b84bda0f2e 100644 --- 
a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -18,32 +18,49 @@ limitations under the License. */ #include #include "BufferArg.h" #include "paddle/math/Matrix.h" +#include "paddle/utils/Any.h" #include "paddle/utils/ClassRegistrar.h" +#include "paddle/utils/Error.h" namespace paddle { /** * Function Configuration. * The argument type of Function::init. - * Follow-up will consider moving this data structure to Proto inside. */ class FuncConfig { public: - union value { - size_t s; - real r; - int i; - bool b; - }; - template <typename T> - T get(const std::string& key) const; + T get(const std::string& key, Error* err = nullptr) const { + try { + return any_cast<T>(valueMap_.at(key)); + } catch (std::exception& e) { // could be a cast or out-of-range exception. + if (err) { + *err = Error(e.what()); + } else { + LOG(FATAL) << "Cannot get key " << key << " with error " << e.what(); + } + return T(); + } + } template <typename T> - FuncConfig& set(const std::string& key, T v); + FuncConfig& set(const std::string& key, T v, Error* err = nullptr) { + auto it = valueMap_.find(key); + if (it != valueMap_.end()) { // already contains key. + if (err) { + *err = Error("Key %s is already set in FuncConfig", key.c_str()); + } else { + LOG(FATAL) << "Key " << key << " is already set in FuncConfig."; + } + return *this; + } + valueMap_[key] = any(v); + return *this; + } protected: - std::map<std::string, value> valueMap_; + mutable std::unordered_map<std::string, any> valueMap_; }; /** diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp index f1a0d2a1a96f24ddff8cd120681a8bc8cddaf40a..adba7c92ece505eecc74edce6b393cf27fa10ccc 100644 --- a/paddle/function/PadOp.cpp +++ b/paddle/function/PadOp.cpp @@ -25,9 +25,9 @@ void Pad(real* outputs, const int inH, const int inW, const PadConf& pad) { - int cstart = pad.channelStart, cend = pad.channelEnd; - int hstart = pad.heightStart, hend = pad.heightEnd; - int wstart = pad.widthStart, wend = pad.widthEnd; + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; @@ -51,9 +51,9 @@ void PadGrad(real* inGrad, const int inH, const int inW, const PadConf& pad) { - int cstart = pad.channelStart, cend = pad.channelEnd; - int hstart = pad.heightStart, hend = pad.heightEnd; - int wstart = pad.widthStart, wend = pad.widthEnd; + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; @@ -71,6 +71,12 @@ void PadGrad(real* inGrad, } } +static inline PadConf castToPadConf(const FuncConfig& conf) { + return {conf.get<std::vector<uint32_t>>("channel"), + conf.get<std::vector<uint32_t>>("height"), + conf.get<std::vector<uint32_t>>("width")}; +} + /** * \brief Padding zeros to input according to the specified dimension. * The struct pad_ contains the padding size in each dimension.
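The `any`-based `FuncConfig` above replaces the old union-plus-specialization scheme: callers instantiate `get`/`set` with arbitrary copyable types, duplicate keys report through the optional `Error*` out-parameter (or `LOG(FATAL)` when none is given), and type mismatches surface as `bad_any_cast` text in the same `Error`. A minimal sketch of the intended call pattern, assuming the `FuncConfig`/`Error` API and the vector-valued pad keys shown in this diff (the function name here is illustrative):

```cpp
#include <cstdint>
#include <vector>
#include "paddle/function/Function.h"  // FuncConfig, per this diff
#include "paddle/utils/Error.h"        // Error, per this diff

void configurePadSketch() {
  paddle::FuncConfig conf;
  // set() accepts any copyable T; a repeated key would land in an Error
  // (or LOG(FATAL) without an Error*), so each key is set exactly once.
  conf.set<std::vector<uint32_t>>("channel", {2, 3})
      .set<std::vector<uint32_t>>("height", {1, 2})
      .set<std::vector<uint32_t>>("width", {3, 2});

  // Typed read-back; requesting a wrong T would fill err with the
  // bad_any_cast message instead of aborting.
  paddle::Error err;
  auto channel = conf.get<std::vector<uint32_t>>("channel", &err);
  (void)channel;
}
```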
@@ -127,14 +133,7 @@ void PadGrad(real* inGrad, template class PadFunc : public FunctionBase { public: - void init(const FuncConfig& config) override { - pad_.channelStart = config.get("cstart"); - pad_.channelEnd = config.get("cend"); - pad_.heightStart = config.get("hstart"); - pad_.heightEnd = config.get("hend"); - pad_.widthStart = config.get("wstart"); - pad_.widthEnd = config.get("wend"); - } + void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(1UL, inputs.size()); @@ -175,14 +174,7 @@ private: template class PadGradFunc : public FunctionBase { public: - void init(const FuncConfig& config) override { - pad_.channelStart = config.get("cstart"); - pad_.channelEnd = config.get("cend"); - pad_.heightStart = config.get("hstart"); - pad_.heightEnd = config.get("hend"); - pad_.widthStart = config.get("wstart"); - pad_.widthEnd = config.get("wend"); - } + void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(1UL, inputs.size()); diff --git a/paddle/function/PadOp.h b/paddle/function/PadOp.h index 7b5c730a6a0fa57833e63beba085cb17054ae2f5..0e226ec7370b9897ebdc697ee528b90a37e4ec56 100644 --- a/paddle/function/PadOp.h +++ b/paddle/function/PadOp.h @@ -19,18 +19,12 @@ limitations under the License. */ namespace paddle { struct PadConf { - /// how many values to add before the data along channel dimension. - int channelStart; - /// how many values to add after the data along channel dimension. - int channelEnd; - /// how many values to add before the data along height dimension. - int heightStart; - /// how many values to add after the data along height dimension. - int heightEnd; - /// how many values to add before the data along width dimension. - int widthStart; - /// how many values to add after the data along width dimension. - int widthEnd; + /// how many values to add before/after the data along channel dimension. + std::vector channel; + /// how many values to add before/after the data along height dimension. + std::vector height; + /// how many values to add before/after the data along width dimension. 
+ std::vector width; }; /** diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu index 9104b1aca507c526858c2117e0a5db59f535091e..9094f1528433fdcaad3397a991aa8ac6fa04bc01 100644 --- a/paddle/function/PadOpGpu.cu +++ b/paddle/function/PadOpGpu.cu @@ -44,9 +44,9 @@ void Pad(real* outputs, size_t nth = num * inC * inH * inW; int blockSize = 1024; int gridSize = (nth + 1024 - 1) / 1024; - int cstart = pad.channelStart, cend = pad.channelEnd; - int hstart = pad.heightStart, hend = pad.heightEnd; - int wstart = pad.widthStart, wend = pad.widthEnd; + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; @@ -83,9 +83,9 @@ void PadGrad(real* inGrad, int nth = num * inC * inH * inW; int blockSize = 1024; int gridSize = (nth + 1024 - 1) / 1024; - int cstart = pad.channelStart, cend = pad.channelEnd; - int hstart = pad.heightStart, hend = pad.heightEnd; - int wstart = pad.widthStart, wend = pad.widthEnd; + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp index cd22d9113567912f7694e05e5d631e49d940e3ac..f77ac2a8c49c83f2d6c64c2a30b6a2f2eb09ac10 100644 --- a/paddle/function/PadOpTest.cpp +++ b/paddle/function/PadOpTest.cpp @@ -24,48 +24,22 @@ TEST(Pad, real) { for (size_t imgSizeW : {5, 32, 96}) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; - - FunctionCompare compare("Pad", - FuncConfig() - .set("cstart", 2) - .set("cend", 3) - .set("hstart", 1) - .set("hend", 2) - .set("wstart", 3) - .set("wend", 2)); - TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; - TensorShape outDims{ - numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5}; - compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims)); - compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO)); - compare.run(); - } - } - } - } -} - -TEST(PadGrad, real) { - for (size_t numSamples : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t imgSizeH : {5, 33, 100}) { - for (size_t imgSizeW : {5, 32, 96}) { - VLOG(3) << " numSamples=" << numSamples << " channels=" << channels - << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; - FunctionCompare compare("PadGrad", - FuncConfig() - .set("cstart", 2) - .set("cend", 3) - .set("hstart", 1) - .set("hend", 2) - .set("wstart", 3) - .set("wend", 2)); - TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; - TensorShape outDims{ - numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5}; - compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims)); - compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO)); - compare.run(); + for (bool test_grad : {false, true}) { + FunctionCompare compare( + test_grad ? "PadGrad" : "Pad", + FuncConfig() + .set>("channel", {2, 3}) + .set>("height", {1, 2}) + .set>("width", {3, 2})); + TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; + TensorShape outDims{ + numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5}; + compare.addInputs( + BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims)); + compare.addOutputs(BufferArg( + VALUE_TYPE_FLOAT, test_grad ? 
inDims : outDims, ASSIGN_TO)); + compare.run(); + } } } } diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 2d300290279d6aafc162f11dbc809537a308ca79..7b1b99b135e35e5fe41dbb3d053a96e3e31e5cf1 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -42,7 +42,8 @@ void AgentLayer::forward(PassType passType) { // get Arguments from real layers if (numSamples_ > 0 && numSamples_ < realHeight) { if (realOutput.ids) { - output_.ids->subVecFrom(*realOutput.ids, 0, numSamples_); + output_.ids = + IVector::create(realOutput.ids->getData(), numSamples_, useGpu_); } else { output_.subArgFrom( realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_); diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp index b8955ab04f209629c855ed66f8e8e9701b7224a3..96cc4288c6faad4b80c790ed2ce6f5128ea83b6d 100644 --- a/paddle/gserver/layers/AverageLayer.cpp +++ b/paddle/gserver/layers/AverageLayer.cpp @@ -26,8 +26,6 @@ bool AverageLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { SequencePoolLayer::init(layerMap, parameterMap); - dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_); - outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_); // average strategy if (config_.average_strategy() == "average") { mode_ = kAverage; @@ -60,43 +58,9 @@ void AverageLayer::forward(PassType passType) { void AverageLayer::backward(const UpdateCallback& callback) { SequencePoolLayer::backward(callback); - const int* starts = startPositions_->getData(false); - MatrixPtr grad = getInputGrad(0); - - if (grad) { - size_t dim = getSize(); - real* gradientData = getInputGrad(0)->getData(); - real* gradient = getOutputGrad()->getData(); - size_t numSequences = startPositions_->getSize() - 1; - for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { - // TODO(Dangqingqing) optimization for GPU - int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; - if (0 == sequenceLength) { - // empty sequence - continue; - } - dataMtx_->setData( - gradientData + starts[sequenceId] * dim, sequenceLength, dim); - outMtx_->setData(gradient + sequenceId * dim); - switch (mode_) { - case kAverage: { - // plain average - dataMtx_->addBias(*outMtx_, 1.0f / sequenceLength); - break; - } - case kSum: { - // sum instead of average - dataMtx_->addBias(*outMtx_, 1.0f); - break; - } - case kAverageSquareRootN: { - // divide by square root of sequenceLength - dataMtx_->addBias(*outMtx_, 1.0f / sqrt(sequenceLength)); - break; - } - default: { LOG(FATAL) << "should not reach here"; } - } - } + if (getInputGrad(0)) { + getInputGrad(0)->sequenceAvgBackward( + *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_); } } diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h index 621e1d7bb12ec5b8c7a6173bd601835d9406e814..332552a30479a368c24db10e5ef3a9d59408c8ef 100644 --- a/paddle/gserver/layers/AverageLayer.h +++ b/paddle/gserver/layers/AverageLayer.h @@ -45,8 +45,6 @@ public: void backward(const UpdateCallback& callback = nullptr) override; protected: - MatrixPtr outMtx_; - MatrixPtr dataMtx_; int mode_; }; } // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp index 9ddccc202705c024076db795a9aeda0c823e9399..fdcf994cdb47f2409b045a1337332e2f4c304fbc 100644 --- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp +++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp @@ -107,6 
+107,10 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, int channel = isDeconv_ ? numFilters_ : channels_[inIdx]; resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]); + + CHECK_EQ(image->getWidth(), + static_cast(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel)); + real *imgData = image->getData() + startIdx * image->getWidth(); MatrixPtr imageTmp = Matrix::create(imgData, diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h index 3f6875fb9f007c0938bfcd7cad99c73b4ba1511b..9afd40b1674680da962d6e51caa56b46279b70de 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h @@ -36,7 +36,7 @@ namespace paddle { * | |- 5 * | * |-*- 0 - * |- 1 + * |- 1 * @endcode * * where * indicates an internal node, and each leaf node represents a class. diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp index bb618c09f9777785d93995fa7140dd4a5383cd1b..a5ed7e057aea8f065ee752f8c0f0d2d9bdddfc8b 100644 --- a/paddle/gserver/layers/PadLayer.cpp +++ b/paddle/gserver/layers/PadLayer.cpp @@ -36,12 +36,9 @@ bool PadLayer::init(const LayerMap& layerMap, CHECK_EQ(2, pad_conf.pad_c_size()); CHECK_EQ(2, pad_conf.pad_h_size()); CHECK_EQ(2, pad_conf.pad_w_size()); - padc_.push_back(pad_conf.pad_c(0)); - padc_.push_back(pad_conf.pad_c(1)); - padh_.push_back(pad_conf.pad_h(0)); - padh_.push_back(pad_conf.pad_h(1)); - padw_.push_back(pad_conf.pad_w(0)); - padw_.push_back(pad_conf.pad_w(1)); + padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)}; + padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)}; + padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)}; outDims_ = TensorShape(4); setOutDims(0); @@ -49,21 +46,15 @@ bool PadLayer::init(const LayerMap& layerMap, createFunction(forward_, "Pad", FuncConfig() - .set("cstart", padc_[0]) - .set("cend", padc_[1]) - .set("hstart", padh_[0]) - .set("hend", padh_[1]) - .set("wstart", padw_[0]) - .set("wend", padw_[1])); + .set("channel", padc_) + .set("height", padh_) + .set("width", padw_)); createFunction(backward_, "PadGrad", FuncConfig() - .set("cstart", padc_[0]) - .set("cend", padc_[1]) - .set("hstart", padh_[0]) - .set("hend", padh_[1]) - .set("wstart", padw_[0]) - .set("wend", padw_[1])); + .set("channel", padc_) + .set("height", padh_) + .set("width", padw_)); return true; } diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h index b2bbf28082e630aeb429ee997a1d43ce7ba05d1c..fe9388d8cc260ed599af0113361f4687f3f4a18b 100644 --- a/paddle/gserver/layers/PadLayer.h +++ b/paddle/gserver/layers/PadLayer.h @@ -38,9 +38,9 @@ protected: void setOutDims(const size_t batchSize); void setTensorDim(const size_t batchSize); - std::vector padc_; - std::vector padh_; - std::vector padw_; + std::vector padc_; + std::vector padh_; + std::vector padw_; TensorShape inDims_; TensorShape outDims_; }; diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 7617af10ba719490d1b33dd297b070cd8c7c292c..a0b1cd471dd02fd20bb2247395bdb74651610bbf 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf, config.biasSize = biasSize == 0 ? 
config.layerConfig.size() : biasSize; config.layerConfig.set_bias_size(config.biasSize); config.layerConfig.set_shared_biases(sharedBias); - config.inputDefs.push_back( - {inputType, "layer_0", conf.input_size(), parameterSize}); + config.inputDefs.push_back({inputType, + "layer_0", + static_cast(conf.input_size()), + parameterSize}); *config.layerConfig.add_inputs()->mutable_proj_conf() = conf; config.testState = testState; testLayerGrad(config, "mixed", batchSize, false, useGpu); diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index d7aa1184872d5a6129becca1f6e282776c9dbe15..6203cd3b9ab9f95853cd3c46750fd55d6dfbba4a 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -85,11 +85,16 @@ int getrf(const CBLAS_ORDER order, float* A, const int lda, int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_sgetrf(order, M, N, A, lda, ipiv); #else return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -99,11 +104,16 @@ int getrf(const CBLAS_ORDER order, double* A, const int lda, int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_dgetrf(order, M, N, A, lda, ipiv); #else return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -112,11 +122,16 @@ int getri(const CBLAS_ORDER order, float* A, const int lda, const int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_sgetri(order, N, A, lda, ipiv); #else return LAPACKE_sgetri(order, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -125,11 +140,16 @@ int getri(const CBLAS_ORDER order, double* A, const int lda, const int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_dgetri(order, N, A, lda, ipiv); #else return LAPACKE_dgetri(order, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index c8559eefd8378450fc18c2ba821c65b39c8cc046..9f8f84a87c5e60b2a6573844f251c42152d8156b 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -17,11 +17,14 @@ limitations under the License. 
*/ #ifdef PADDLE_USE_MKL #include +#ifdef PADDLE_USE_LAPACK #include +#endif #else extern "C" { #include } +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS extern "C" { #include @@ -30,6 +33,7 @@ extern "C" { #include #endif #endif +#endif #include diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 9eead5b62c690b0a3310d8b68bfa3f1870be17c2..55a7344495f8e57dc95095ab1b81b45008fa9acc 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -483,6 +483,20 @@ void GpuMatrix::sequenceAvgForward(Matrix& a, hl_sequence_avg_forward(dst, src, starts, height, width, mode); } +void GpuMatrix::sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = a.getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + + hl_sequence_avg_backward(dst, src, starts, height, width, mode); +} + /* this = scaleAB*(a*b) + scaleT*this */ void GpuMatrix::mul(const GpuMatrix& a, const GpuMatrix& b, @@ -2304,6 +2318,41 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, } } +void CpuMatrix::sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = a.getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); + MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); + for (size_t i = 0; i < height; ++i) { + int sequenceLength = starts[i + 1] - starts[i]; + if (0 == sequenceLength) { + // empty sequence + continue; + } + outMtx->setData(dst + starts[i] * width, sequenceLength, width); + dataMtx->setData(src + i * width); + if (mode == 0) { + // plain average + outMtx->addBias(*dataMtx, 1.0f / sequenceLength); + } else if (mode == 1) { + // sum instead of average + outMtx->addBias(*dataMtx, 1.0f); + } else if (mode == 2) { + // divide by square root of sequenceLength + outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength)); + } else { + LOG(FATAL) << "should not reach here"; + } + } +} + /* this = scaleAB*(a*b) + scaleT*this*/ void CpuMatrix::mul(const Matrix& a, const Matrix& b, @@ -2377,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int lda = a->getStride(); int ldb = b->getStride(); int ldc = getStride(); -#ifndef PADDLE_TYPE_DOUBLE - cblas_sgemm(CblasRowMajor, - a_trans, - b_trans, - M, - N, - K, - scaleAB, - A, - lda, - B, - ldb, - scaleT, - C, - ldc); -#else - cblas_dgemm(CblasRowMajor, - a_trans, - b_trans, - M, - N, - K, - scaleAB, - A, - lda, - B, - ldb, - scaleT, - C, - ldc); -// TODO(yuyang18): Is gemm defined other place? 
-#endif - - VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0] - << " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1]; + gemm( + a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); } void CpuMatrix::mul( diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index dbdb629614546b7c7b569d7473d96a06d0c5a9c7..3252adb19e4c2e48f86c3c811bfc7d75fd06a8f7 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -461,6 +461,12 @@ public: LOG(FATAL) << "Not implemented"; } + virtual void sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + LOG(FATAL) << "Not implemented"; + } + /** * @code * this = scaleAB*(a*b) + scaleT*this @@ -1203,6 +1209,7 @@ public: void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); /** * @code @@ -1619,6 +1626,7 @@ public: void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); /** * @code diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp index 95219debf50e57407b668d315b91141d259fc779..d66d543a61450b47b7758b50eaecc107c6fe3576 100644 --- a/paddle/math/SIMDFunctions.cpp +++ b/paddle/math/SIMDFunctions.cpp @@ -13,119 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "SIMDFunctions.h" +#ifdef __SSE3__ #include +#endif #include -#ifndef __AVX__ -static void addto_sse(float* a, const float* b, size_t len) { - int offset = len % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - mb0 = _mm_load_ps(b); - mb1 = _mm_load_ps(b + 4); - mb2 = _mm_load_ps(b + 8); - mb3 = _mm_load_ps(b + 12); - - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) a[i] += b[i]; -} - -static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { - int offset = len % 16; - - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - for (int i = 0; i < batch; i++) { - mb0 = _mm_load_ps(b[i]); - mb1 = _mm_load_ps(b[i] + 4); - mb2 = _mm_load_ps(b[i] + 8); - mb3 = _mm_load_ps(b[i] + 12); - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - b[i] += 16; - } - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) { - for (int k = 0; k < batch; k++) a[i] += b[k][i]; - } - return; -} - -static void col_max_sse(float* result, - const float* data, - int dim, - int numSamples) { - // first sample, direct copy - for (int d = 0; d < dim; ++d) { - result[d] = data[d]; - } - int offset = dim % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - // first 16n dims - for (int k = 0; k < dim / 16; k++, result += 16, data += 16) { 
- ma0 = _mm_load_ps(result); - ma1 = _mm_load_ps(result + 4); - ma2 = _mm_load_ps(result + 8); - ma3 = _mm_load_ps(result + 12); - for (int i = 1; i < numSamples; i++) { - mb0 = _mm_load_ps(data + i * dim); - mb1 = _mm_load_ps(data + i * dim + 4); - mb2 = _mm_load_ps(data + i * dim + 8); - mb3 = _mm_load_ps(data + i * dim + 12); - ma0 = _mm_max_ps(ma0, mb0); - ma1 = _mm_max_ps(ma1, mb1); - ma2 = _mm_max_ps(ma2, mb2); - ma3 = _mm_max_ps(ma3, mb3); - } - _mm_store_ps(result, ma0); - _mm_store_ps(result + 4, ma1); - _mm_store_ps(result + 8, ma2); - _mm_store_ps(result + 12, ma3); - } - // last dims - for (int d = 0; d < offset; ++d) { - float sm = data[d]; - for (int i = 1; i < numSamples; ++i) { - sm = std::max(sm, data[i * dim + d]); - } - result[d] = sm; - } -} - -#else +#ifdef __AVX__ static void addto_avx(float* a, const float* b, size_t len) { int offset = len % 32; @@ -355,17 +248,128 @@ static void decayL1_avx( } } +#elif defined(__SSE3__) + +static void addto_sse(float* a, const float* b, size_t len) { + int offset = len % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 = _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + mb0 = _mm_load_ps(b); + mb1 = _mm_load_ps(b + 4); + mb2 = _mm_load_ps(b + 8); + mb3 = _mm_load_ps(b + 12); + + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) a[i] += b[i]; +} + +static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { + int offset = len % 16; + + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 = _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + for (int i = 0; i < batch; i++) { + mb0 = _mm_load_ps(b[i]); + mb1 = _mm_load_ps(b[i] + 4); + mb2 = _mm_load_ps(b[i] + 8); + mb3 = _mm_load_ps(b[i] + 12); + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + b[i] += 16; + } + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + for (int k = 0; k < batch; k++) a[i] += b[k][i]; + } + return; +} + +static void col_max_sse(float* result, + const float* data, + int dim, + int numSamples) { + // first sample, direct copy + for (int d = 0; d < dim; ++d) { + result[d] = data[d]; + } + int offset = dim % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + // first 16n dims + for (int k = 0; k < dim / 16; k++, result += 16, data += 16) { + ma0 = _mm_load_ps(result); + ma1 = _mm_load_ps(result + 4); + ma2 = _mm_load_ps(result + 8); + ma3 = _mm_load_ps(result + 12); + for (int i = 1; i < numSamples; i++) { + mb0 = _mm_load_ps(data + i * dim); + mb1 = _mm_load_ps(data + i * dim + 4); + mb2 = _mm_load_ps(data + i * dim + 8); + mb3 = _mm_load_ps(data + i * dim + 12); + ma0 = _mm_max_ps(ma0, mb0); + ma1 = _mm_max_ps(ma1, mb1); + ma2 = _mm_max_ps(ma2, mb2); + ma3 = _mm_max_ps(ma3, mb3); + } + _mm_store_ps(result, ma0); + _mm_store_ps(result + 4, ma1); + _mm_store_ps(result + 8, ma2); + _mm_store_ps(result + 12, ma3); + } + // last dims + for (int d = 0; d < offset; ++d) { + float sm = 
data[d]; + for (int i = 1; i < numSamples; ++i) { + sm = std::max(sm, data[i * dim + d]); + } + result[d] = sm; + } +} + #endif -#ifndef __AVX__ -#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__) -#else +#if defined(__AVX__) #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__) +#elif defined(__SSE3__) +#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__) #endif namespace paddle { namespace simd { namespace internal { +#ifdef __SSE3__ void addToImpl(float* a, const float* b, size_t len) { SIMD_INVOKE(addto, a, b, len); } @@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) { void colMaxImpl(float* result, const float* data, int dim, int numSamples) { SIMD_INVOKE(col_max, result, data, dim, numSamples); } +#endif #ifdef __AVX__ void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) { @@ -385,8 +390,8 @@ void decayL1AvxImpl( float* dst, float* src, float* lr, float lambda, size_t len) { decayL1_avx(dst, src, lr, lambda, len); } - #endif + } // namespace internal } // namespace simd } // namespace paddle diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h index 9b0a8719b287a2b88e966484090974586d64521f..439f11b79d134d7054f45f2d0a70fc5a6fde6c13 100644 --- a/paddle/math/SIMDFunctions.h +++ b/paddle/math/SIMDFunctions.h @@ -128,17 +128,29 @@ void decayL1AvxImpl( template <> inline void addTo(float* a, const float* b, size_t len) { +#ifdef __SSE3__ internal::addToImpl(a, b, len); +#else + naive::addTo(a, b, len); +#endif } template <> inline void batchAddTo(float* a, const float* b[], int batch, size_t len) { +#ifdef __SSE3__ internal::batchAddToImpl(a, b, batch, len); +#else + naive::batchAddTo(a, b, batch, len); +#endif } template <> inline void colMax(float* result, const float* data, int dim, int numSamples) { +#ifdef __SSE3__ internal::colMaxImpl(result, data, dim, numSamples); +#else + naive::colMax(result, data, dim, numSamples); +#endif } template <> diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 56e5442394b04230c22d668aa734dc0fa44004c2..7ce17a3207becb176a852a16fca52376009db9ee 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "Storage.h" #include "Allocator.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" DEFINE_int32(pool_limit_size, @@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { } if (gpuAllocator_[deviceId] == nullptr) { std::string name = - "gpu" + std::to_string(deviceId) + std::string("_pool"); + "gpu" + str::to_string(deviceId) + std::string("_pool"); gpuAllocator_[deviceId] = new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); } diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 08b64c1bb6f5d359a2d2164e723a76c5360168ee..dd19fe516fbf724a86479e6f27032614ab4c6106 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -685,7 +685,7 @@ TEST(SMatrix, topK) { } } -void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) { +void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) { MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); cpuInput->randomizeUniform(); @@ -706,15 +706,25 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) { gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode); TensorCheckErr(*cpuOutput, *gpuOutput); + + MatrixPtr cpuInGrad = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInGrad = std::make_shared(batchSize, inputDim); + cpuInGrad->randomizeUniform(); + gpuInGrad->copyFrom(*cpuInGrad); + + cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode); + gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode); + + TensorCheckErr(*cpuInGrad, *gpuInGrad); } -TEST(Matrix, sequenceAvgForward) { +TEST(Matrix, sequenceAvg) { for (auto batchSize : {10, 128, 6000}) { for (auto inputDim : {32, 100, 512}) { for (auto mode : {0, 1, 2}) { VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim << " mode=" << mode; - testMatrixSequenceAvgForward(batchSize, inputDim, mode); + testMatrixSequenceAvg(batchSize, inputDim, mode); } } } diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 877cbb86ec112739a5c7eeee969ca48ef491ee87..19ff40ba7e9584f772043f939bcb31caf666163d 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/utils/Flags.h" #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/Stat.h" +#include "paddle/utils/StringUtil.h" DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); DEFINE_double(async_lagged_ratio_min, @@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, callback(response); /// always defined, barrier slowest node function need it. 
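The CPU/GPU `sequenceAvgBackward` pair exercised by the `testMatrixSequenceAvg` change above scatters each sequence's output gradient back over its timesteps, scaled the same way the forward pass pooled them (mode 0: average, 1: sum, 2: divide by the square root of the sequence length). A scalar reference sketch, with flat `std::vector` buffers standing in for Paddle's `Matrix`; names and layout are illustrative, not the real API:

```cpp
#include <cmath>
#include <vector>

// dst: input gradient, (starts.back() x width), accumulated into.
// src: output gradient, ((starts.size() - 1) x width), one row per sequence.
void sequenceAvgBackwardRef(std::vector<float>& dst,
                            const std::vector<float>& src,
                            const std::vector<int>& starts,
                            int width, int mode) {
  int numSeq = static_cast<int>(starts.size()) - 1;
  for (int seq = 0; seq < numSeq; ++seq) {
    int len = starts[seq + 1] - starts[seq];
    if (len == 0) continue;  // empty sequence contributes nothing
    float scale = mode == 1 ? 1.0f
                : mode == 0 ? 1.0f / len
                            : 1.0f / std::sqrt(static_cast<float>(len));
    // Every timestep of the sequence receives the same scaled gradient row.
    for (int i = starts[seq]; i < starts[seq + 1]; ++i)
      for (int c = 0; c < width; ++c)
        dst[i * width + c] += src[seq * width + c] * scale;
  }
}
```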
- statSet_.reset(new StatSet("ParameterServer" + std::to_string(serverId_))); + statSet_.reset(new StatSet("ParameterServer" + + str::to_string(static_cast<int>(serverId_)))); } real bufferSum(const std::vector& buffers) { diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 6d6a406cf61d467cb2701ca5e85e99648eea36eb..879703a00c197e62ba7e21b8e2e2dea2889c4e13 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -160,10 +160,19 @@ class SparseFloatScanner(SparseBinaryScanner): class IndexScanner(IScanner): def __init__(self, input_type, pos): IScanner.__init__(self, input_type, pos) - self.__ids__ = [] + self.__ids__ = None + self.__idx__ = 0 + + def pre_scan(self, dat): + self.__idx__ += 1 + + def finish_pre_scan(self, argument): + self.__ids__ = [0] * self.__idx__ + self.__idx__ = 0 def scan(self, dat): - self.__ids__.append(dat) + self.__ids__[self.__idx__] = dat + self.__idx__ += 1 def finish_scan(self, argument): ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu) diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index 7c90316ad82a6430d6c12d72e07b166b6d9d98a9..132f8cd8aaf544984e1867f63c172808d087c91f 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -94,7 +94,7 @@ docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host): ```bash -docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev +docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev ``` This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, can build the source code with any local changes. When it writes to `/paddle/build` in the container, it is in fact writing to `$PWD/build` on the host. @@ -110,7 +110,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" - `WITH_AVX`: ***Required***. Set to "OFF" to prevent generating AVX instructions. If you don't know what AVX is, you might want to set it to "ON". - `WITH_TEST`: ***Optional, default OFF***. Build unit test binaries. Once you've built the unit tests, you can run these tests manually with the following command: ```bash - docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall" + docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall" ``` - `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building them first. @@ -129,7 +129,7 @@ This production image is minimal -- it includes binary `paddle`, the shared libr Again the development happens on the host. Suppose we have a simple application program in `a.py`; we can test and run it using the production image: ```bash -docker run -it -v $PWD:/work paddle /work/a.py +docker run --rm -it -v $PWD:/work paddle /work/a.py ``` But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image, with the additional dependencies installed. @@ -166,3 +166,18 @@ docker tag myapp me/myapp docker push kubectl ...
``` + +### Reading source code with woboq codebrowser +For developers who are interested in the C++ source code, pass `-e "WOBOQ=ON"` to build the C++ source code into browsable HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser). + +- The following command builds PaddlePaddle, generates HTML pages from the C++ source code, and writes these HTML pages into `$HOME/woboq_out` on the host: + +```bash +docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddle:dev +``` + +- You can open the generated HTML files in your Web browser. Or, if you want to run an Nginx container to serve them for a wider audience, you can run: + +```bash +docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx +``` diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index e6ed01428a63b9c55bf6ec299ea1c8bff71f3b65..176b8278f1102b240d02a494388a18229a682d55 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -46,7 +46,7 @@ make install # install them in docker cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" .. -if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then +if [[ ${WOBOQ:-OFF} == 'ON' ]]; then apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev # Install woboq_codebrowser. git clone https://github.com/woboq/woboq_codebrowser /woboq @@ -56,7 +56,7 @@ if [[ ${WOBOQ:-OFF} == 'ON' ]]; then . make - export WOBOQ_OUT=/usr/share/nginx/html/paddle + export WOBOQ_OUT=/woboq_out/paddle export BUILD_DIR=/paddle/build mkdir -p $WOBOQ_OUT cp -rv /woboq/data $WOBOQ_OUT/../data diff --git a/paddle/utils/Any.h b/paddle/utils/Any.h new file mode 100644 index 0000000000000000000000000000000000000000..99a0139accc4988f1e4cce45eeb688a6603c2c31 --- /dev/null +++ b/paddle/utils/Any.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#if __cplusplus > 201402L +#include <any> + +namespace paddle { +// use std::any for C++17 +using std::any; +using std::any_cast; +using std::bad_any_cast; +} // namespace paddle + +#else +#include + +namespace paddle { +// use linb::any for C++11 +using linb::any; +using linb::any_cast; +using linb::bad_any_cast; +} // namespace paddle +#endif diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp index 8eefdd2980e7f56a836df6fd2ff8c31b81a55555..edd33c454122d95078e0fde2a2e9d68903951ee8 100644 --- a/paddle/utils/CpuId.cpp +++ b/paddle/utils/CpuId.cpp @@ -19,7 +19,7 @@ limitations under the License. */ /// for MSVC #define CPUID(info, x) __cpuidex(info, x, 0) -#else +#elif !defined(__ANDROID__) #include <cpuid.h> @@ -31,6 +31,7 @@ limitations under the License. */ namespace paddle { SIMDFlags::SIMDFlags() { +#if !defined(__ANDROID__) unsigned int cpuInfo[4]; // CPUID: https://en.wikipedia.org/wiki/CPUID // clang-format off @@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() { CPUID(cpuInfo, 0x80000001); simd_flags_ |= cpuInfo[2] & (1 << 16) ?
SIMD_FMA4 : SIMD_NONE; // clang-format on +#else + simd_flags_ = SIMD_NEON; +#endif } SIMDFlags const* SIMDFlags::instance() { diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h index 5fc610964d4f5b8064f16ebf1b26bbb002264ce1..869be5be541dafd699a87a8e8893aadadf59b711 100644 --- a/paddle/utils/CpuId.h +++ b/paddle/utils/CpuId.h @@ -30,6 +30,7 @@ enum simd_t { SIMD_AVX = 1 << 8, ///< AVX SIMD_AVX2 = 1 << 9, ///< AVX 2 SIMD_AVX512 = 1 << 10, ///< AVX 512 + SIMD_NEON = 1 << 11, ///< NEON }; // clang-format on @@ -96,6 +97,7 @@ private: #define HAS_AVX HAS_SIMD(SIMD_AVX) #define HAS_AVX2 HAS_SIMD(SIMD_AVX2) #define HAS_AVX512 HAS_SIMD(SIMD_AVX512) +#define HAS_NEON HAS_SIMD(SIMD_NEON) // clang-format on /** diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp index 5a1c6ecb2219f7983609c27f3215c7fc1e9e9ef2..ea96bad240ad81c4c29b7dab35b015549052e2bb 100644 --- a/paddle/utils/Logging.cpp +++ b/paddle/utils/Logging.cpp @@ -18,6 +18,7 @@ limitations under the License. */ */ #include "Logging.h" +#include namespace paddle { diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h index 0b4f4c9113ae9d714b634b67931e51b408bbe777..95f071cb7de87d87f6988c136d7993c66fa9dde1 100644 --- a/paddle/utils/StringUtil.h +++ b/paddle/utils/StringUtil.h @@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) { return v; } +/** + * Cast type T to string with status. + * + * @param [in] v input value of type T. + * @param [out] ok status; set to true if the cast succeeded. Pass + * nullptr if the caller does not care about errors. + * @return result of casting. If an error occurred, an empty string will be + * returned. + */ +template <class T> +inline std::string toWithStatus(const T v, bool* ok = nullptr) { + std::ostringstream sout; + sout << v; + if (ok) { + *ok = !sout.fail(); + } + return sout.str(); +} + /// Convert string to type T. It makes sure all the characters in s are used. /// Otherwise it will abort. /// @@ -67,6 +86,18 @@ inline T to(const std::string& s) { return v; } +/// Convert type T to string. +/// +/// @tparam T type of input value +/// @param v input value of type T +template <class T> +std::string to_string(T v) { + bool ok; + std::string s = toWithStatus(v, &ok); + CHECK(ok) << "Cannot convert v(" << v << ") to type std::string"; + return s; +} + } // namespace str #undef DEFINE_STRING_CONVERSION diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 1f56b6b8a96602d298507452fc7182d46179de41..b18b73e06a6c39c3bf9717280bc6323917c80efb 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -15,11 +15,16 @@ limitations under the License. */ #include "Util.h" #include -#include #include #include #include + +#ifdef __SSE__ #include <xmmintrin.h> +#endif +#ifdef __SSE3__ +#include <pmmintrin.h> +#endif #include #include @@ -163,8 +168,12 @@ void initMain(int argc, char** argv) { installProfilerSwitch(); +#ifdef __SSE__ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); +#endif +#ifdef __SSE3__ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif if (FLAGS_seed == 0) { unsigned int t = time(NULL); diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp index 2a6f96e04d024ac3977bc154dbeeb69ce9ab3a5d..310c9a6542563891d4ba5888e58406ea28d6a2ce 100644 --- a/paddle/utils/arch/linux/Locks.cpp +++ b/paddle/utils/arch/linux/Locks.cpp @@ -15,6 +15,7 @@ limitations under the License.
*/ #include "paddle/utils/Locks.h" #include #include +#include "paddle/utils/Logging.h" namespace paddle { class SemaphorePrivate { @@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { sem_init(&m->sem, 0, initValue); } -Semaphore::~Semaphore() { sem_destroy(&m->sem); } +Semaphore::~Semaphore() { + sem_destroy(&m->sem); + delete m; +} bool Semaphore::timeWait(struct timespec* ts) { return (0 == sem_timedwait(&m->sem, ts)); @@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); } void Semaphore::post() { sem_post(&m->sem); } +#ifdef PADDLE_USE_PTHREAD_SPINLOCK + class SpinLockPrivate { public: inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } + + inline void lock() { pthread_spin_lock(&lock_); } + inline void unlock() { pthread_spin_unlock(&lock_); } + pthread_spinlock_t lock_; char padding_[64 - sizeof(pthread_spinlock_t)]; }; -SpinLock::SpinLock() : m(new SpinLockPrivate()) {} +#else -SpinLock::~SpinLock() { delete m; } +#include +class SpinLockPrivate { +public: + inline void lock() { + while (lock_.test_and_set(std::memory_order_acquire)) { + } + } + inline void unlock() { lock_.clear(std::memory_order_release); } + + std::atomic_flag lock_ = ATOMIC_FLAG_INIT; + char padding_[64 - sizeof(lock_)]; // Padding to cache line size +}; -void SpinLock::lock() { pthread_spin_lock(&m->lock_); } +#endif -void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); } +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} +SpinLock::~SpinLock() { delete m; } +void SpinLock::lock() { m->lock(); } +void SpinLock::unlock() { m->unlock(); } + +#ifdef PADDLE_USE_PTHREAD_BARRIER class ThreadBarrierPrivate { public: pthread_barrier_t barrier_; + + inline explicit ThreadBarrierPrivate(int count) { + pthread_barrier_init(&barrier_, nullptr, count); + } + + inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); } + + inline void wait() { pthread_barrier_wait(&barrier_); } }; -ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) { - pthread_barrier_init(&m->barrier_, nullptr, count); -} +#else -ThreadBarrier::~ThreadBarrier() { - pthread_barrier_destroy(&m->barrier_); - delete m; -} +class ThreadBarrierPrivate { +public: + pthread_mutex_t mutex_; + pthread_cond_t cond_; + int count_; + int tripCount_; + + inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { + CHECK_NE(cnt, 0); + CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); + CHECK_GE(pthread_cond_init(&cond_, 0), 0); + } + + inline ~ThreadBarrierPrivate() { + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); + } + + /** + * @brief wait + * @return true if the last wait + */ + inline bool wait() { + pthread_mutex_lock(&mutex_); + ++count_; + if (count_ >= tripCount_) { + count_ = 0; + pthread_cond_broadcast(&cond_); + pthread_mutex_unlock(&mutex_); + return true; + } else { + pthread_cond_wait(&cond_, &mutex_); + pthread_mutex_unlock(&mutex_); + return false; + } + } +}; + +#endif -void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); } +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::~ThreadBarrier() { delete m; } +void ThreadBarrier::wait() { m->wait(); } } // namespace paddle diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index 378788bcecd579fff1c762702a8c27f54cee94bf..b5d9f93f1376048eabd726331006b0bb848bce11 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ 
b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/utils/CustomStackTrace.h" #include "paddle/utils/Locks.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" DEFINE_int32(test_thread_num, 10, "testing thread number"); @@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) { while (countDown-- > 0) { start.wait(); for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + std::to_string(i)); + tracer.push("layer_" + paddle::str::to_string(i)); } tracer.pop(""); for (size_t i = 0; i < layerSize; ++i) { - tracer.pop("layer_" + std::to_string(layerSize - 1 - i)); + tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); } finish.wait(); } @@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) { while (countDown-- > 0) { start.wait(); for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + std::to_string(i)); + tracer.push("layer_" + paddle::str::to_string(i)); } tracer.clear(); // in forward test, tracer will clear after forward. finish.wait(); diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp index 611b16aa7116d03ee51ba0095d043b78df1742ba..360c61c88a757da708b01d2bb54068b948b235cc 100644 --- a/paddle/utils/tests/test_CustomStackTracePrint.cpp +++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/utils/CustomStackTrace.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" int main(int argc, char** argv) { paddle::initMain(argc, argv); for (size_t i = 0; i < 1000; ++i) { - paddle::gLayerStackTrace.push("layer_" + std::to_string(i)); + paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i)); if (i == 998) { throw "Unhandle exception"; } diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp index 8200a24ce7b7df75b48a89fbb7af15f304c5957f..185789c927be19385d6ddc7a1889b6cc56109d38 100644 --- a/paddle/utils/tests/test_SIMDFlags.cpp +++ b/paddle/utils/tests/test_SIMDFlags.cpp @@ -18,7 +18,8 @@ limitations under the License. 
*/ using namespace paddle; // NOLINT TEST(SIMDFlags, gccTest) { -#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) +#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \ + !defined(__arm__) // clang-format off CHECK(!__builtin_cpu_supports("sse") != HAS_SSE); CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2); @@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) { LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX; LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2; LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512; + LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON; } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 48e0a1993d07f801e65dfa54a991995c593fe475..e7a0895533dd8902df9a012ab230df2a67256483 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -24,8 +24,9 @@ add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) add_subdirectory(paddle/trainer_config_helpers/tests) -add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/tests) +add_subdirectory(paddle/v2/reader/tests) +add_subdirectory(paddle/v2/plot/tests) install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/ DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 8d2329292b5b8b408473c2e33fc43b2e586d89b6..aae419566f051e894f25e83902099ee0f39a469c 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -18,7 +18,7 @@ import inspect from paddle.trainer.config_parser import * from .activations import LinearActivation, SigmoidActivation, TanhActivation, \ - ReluActivation, IdentityActivation, SoftmaxActivation + ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation from .evaluators import * from .poolings import MaxPooling, AvgPooling, BasePoolingType from .attrs import * @@ -1916,7 +1916,7 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None): @layer_support() def hsigmoid(input, label, - num_classes, + num_classes=None, name=None, bias_attr=None, param_attr=None, @@ -1932,8 +1932,7 @@ def hsigmoid(input, .. code-block:: python cost = hsigmoid(input=[layer1, layer2], - label=data_layer, - num_classes=3) + label=data_layer) :param input: Input layers. It could be a LayerOutput or list/tuple of LayerOutput. @@ -1941,12 +1940,14 @@ def hsigmoid(input, :param label: Label layer. :type label: LayerOutput :param num_classes: number of classes. - :type num_classes: int + :type num_classes: int|None :param name: layer name :type name: basestring :param bias_attr: Bias attribute. None means default bias. False means no bias. :type bias_attr: ParameterAttribute|False + :param param_attr: Parameter Attribute. None means default parameter. + :type param_attr: ParameterAttribute|None :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. 
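Back on the C++ side, the new `SIMD_NEON` flag and `HAS_NEON` macro from the CpuId changes earlier in this diff give ARM builds a runtime capability check even though the x86 CPUID path is compiled out under `__ANDROID__`. A hedged sketch of the kind of dispatch these flags enable; the kernel names are purely illustrative:

```cpp
#include <iostream>
#include "paddle/utils/CpuId.h"  // HAS_AVX / HAS_NEON, per this diff

int main() {
  // On x86, SIMDFlags is populated from CPUID; under __ANDROID__ the
  // constructor patched above simply reports SIMD_NEON.
  if (HAS_AVX) {
    std::cout << "dispatching to AVX kernels\n";
  } else if (HAS_NEON) {
    std::cout << "dispatching to NEON kernels\n";
  } else {
    std::cout << "dispatching to scalar kernels\n";
  }
  return 0;
}
```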
@@ -1966,6 +1967,11 @@ def hsigmoid(input, assert isinstance(label, LayerOutput) assert label.layer_type == LayerType.DATA + if num_classes is None: + num_classes = label.size + if num_classes is None or num_classes <= 2: + raise ValueError("hsigmoid label size must larger than 2.") + ipts_for_layer = [] parents = [] for each_input, each_param_attr in zip(input, param_attr): @@ -2253,8 +2259,9 @@ def img_pool_layer(input, pool_type.name = 'avg' type_name = pool_type.name + '-projection' \ - if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \ - else pool_type.name + if ( + isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \ + else pool_type.name pool_size_y = pool_size if pool_size_y is None else pool_size_y stride_y = stride if stride_y is None else stride_y @@ -3294,8 +3301,8 @@ def recurrent_group(step, assert (targetInlink == None or targetInlink_in_inlinks()) targetInlinkName = None if targetInlink == None \ - else targetInlink.name if isinstance(targetInlink, LayerOutput) \ - else targetInlink.input.name + else targetInlink.name if isinstance(targetInlink, LayerOutput) \ + else targetInlink.input.name contains_sub_seq = [False] @@ -4807,12 +4814,14 @@ def crf_decoding_layer(input, return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1) +@wrap_act_default(act=SigmoidActivation()) @wrap_bias_attr_default(has_bias=True) @wrap_name_default() @layer_support() def nce_layer(input, label, num_classes, + act=None, weight=None, num_neg_samples=10, neg_distribution=None, @@ -4841,6 +4850,8 @@ def nce_layer(input, :type weight: LayerOutput :param num_classes: number of classes. :type num_classes: int + :param act: Activation, default is Sigmoid. + :type act: BaseActivation :param num_neg_samples: number of negative samples. Default is 10. :type num_neg_samples: int :param neg_distribution: The distribution for generating the random negative labels. @@ -4863,6 +4874,8 @@ def nce_layer(input, assert isinstance(neg_distribution, collections.Sequence) assert len(neg_distribution) == num_classes assert sum(neg_distribution) == 1 + if not isinstance(act, BaseActivation): + raise TypeError() ipts_for_layer = [] parents = [] @@ -4884,12 +4897,17 @@ def nce_layer(input, type=LayerType.NCE_LAYER, num_classes=num_classes, neg_sampling_dist=neg_distribution, + active_type=act.name, num_neg_samples=num_neg_samples, inputs=ipts_for_layer, bias=ParamAttr.to_bias(bias_attr), **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( - name, LayerType.NCE_LAYER, parents=parents, size=l.config.size) + name, + LayerType.NCE_LAYER, + parents=parents, + size=l.config.size, + activation=act) """ diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 25526bf409cf82f26979a84700ce948ac969df0c..7c8f6ea62fcb74700f7356ed4b937a3aaa1c7092 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -21,19 +21,22 @@ import data_type import topology import data_feeder import networks +import evaluator from . import dataset from . import reader +from . 
import plot import attr import pooling import inference import networks import py_paddle.swig_paddle as api import minibatch +import plot __all__ = [ 'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer', 'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader', - 'topology', 'networks', 'infer' + 'topology', 'networks', 'infer', 'plot', 'evaluator' ] diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py index 1ec1d7bbdf912b940ca4b8e7b20eb11310f0e74f..b0e8da563e0d65d534d3f224fe5f1c39a67eeb54 100644 --- a/python/paddle/v2/config_base.py +++ b/python/paddle/v2/config_base.py @@ -65,13 +65,42 @@ class Layer(object): def __init__(self, name=None, parent_layers=None): assert isinstance(parent_layers, dict) self.name = name - self.__contex__ = {} + self.__context__ = {} self.__parent_layers__ = parent_layers + # some layers may have extra parent layers + self.__extra_parent__ = [] + # used for evaluator. + self.__children_layers__ = [] + + def extra_parent(self): + return self.__extra_parent__ + + def append_extra_parent(self, parent): + self.__extra_parent__.append(parent) + + def append_child(self, layer, parent_names): + self.__children_layers__.append((layer, parent_names)) def to_proto(self, context): """ function to set proto attribute """ + self.__context__ = context + + # STEP: short-cut if this layer has been parsed before. + if self.context_name() in context: + if self.use_context_name(): + return context[self.context_name()] + else: + return context[self.name] + + # STEP: parse extra_parent that is not used by this layer but must + # be parsed before this layer. + for p in self.__extra_parent__: + p.to_proto(context=context) + + # STEP: parse the parents used by this layer, get the results and + # insert them into kwargs of this layer's to_proto_impl method. kwargs = dict() for layer_name in self.__parent_layers__: if not isinstance(self.__parent_layers__[layer_name], @@ -83,12 +112,29 @@ class Layer(object): self.__parent_layers__[layer_name]) kwargs[layer_name] = v1_layer + # STEP: parse myself and add myself into context. + ret_val = self.to_proto_impl(**kwargs) + if self.context_name() is not None \ + and self.context_name() not in context: + context[self.context_name()] = ret_val + + # STEP: parse children that should be parsed after this layer. + for layer, pnames in self.__children_layers__: + drop = False + + # child will only be parsed if all parents are in context. + for pname in pnames: + if pname not in context: + drop = True + break + if drop: + continue + layer.to_proto(context=context) + + # STEP: return v1 layer result if self.context_name() is None: - return self.to_proto_impl(**kwargs) - elif self.context_name() not in context: - context[self.context_name()] = self.to_proto_impl(**kwargs) - self.__contex__ = context - if self.use_context_name(): + return ret_val + elif self.use_context_name(): return context[self.context_name()] else: return context[self.name] @@ -113,10 +159,13 @@ class Layer(object): this layer is called.
         :return:
         """
-        return self.__contex__[self.context_name()].size
+        return self.__context__[self.context_name()].size
 
 
-def __convert_to_v2__(method_name, parent_names, is_default_name=True):
+def __convert_to_v2__(method_name,
+                      parent_names,
+                      is_default_name=True,
+                      attach_parent=False):
     if is_default_name:
         wrapper = wrap_name_default(name_prefix=method_name)
     else:
@@ -129,9 +178,20 @@ def __convert_to_v2__(method_name, parent_names, is_default_name=True):
         parent_layers = dict()
         other_kwargs = dict()
         for pname in parent_names:
-            if kwargs.has_key(pname):
+            if pname in kwargs:
                 parent_layers[pname] = kwargs[pname]
 
+        if attach_parent:
+            pnames = [x.context_name() for x in parent_layers.values()]
+
+            for pname in parent_layers:
+                layers = kwargs[pname]
+                if not isinstance(layers, collections.Sequence):
+                    layers = [layers]
+
+                for layer in layers:
+                    layer.append_child(self, pnames)
+
         for key in kwargs.keys():
             if key not in parent_names:
                 other_kwargs[key] = kwargs[key]
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 7021a6da05dec6be216534112c2df2586e73390f..2eb018b8d60e9a8bd0091836ab56c35b05786fca 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -66,13 +66,6 @@ def download(url, module_name, md5sum):
     return filename
 
 
-def dict_add(a_dict, ele):
-    if ele in a_dict:
-        a_dict[ele] += 1
-    else:
-        a_dict[ele] = 1
-
-
 def fetch_all():
     for module_name in filter(lambda x: not x.startswith("__"),
                               dir(paddle.v2.dataset)):
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 0340f37f27625b4df03fde4a46d9fb611cb946a7..a3af22f5be419cea999fa12cf7588c79d4069b15 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -22,6 +22,7 @@ into paddle reader creators.
 """
 
 import paddle.v2.dataset.common
+import collections
 import tarfile
 import Queue
 import re
@@ -58,10 +59,10 @@ def build_dict(pattern, cutoff):
     """
     Build a word dictionary, the key is word, and the value is index.
     """
-    word_freq = {}
+    word_freq = collections.defaultdict(int)
     for doc in tokenize(pattern):
         for word in doc:
-            paddle.v2.dataset.common.dict_add(word_freq, word)
+            word_freq[word] += 1
 
     # Not sure if we should prune less-frequent words here.
     word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 917a0be849778aae7939fd953396389ceda06c02..4d3c0d59246d33dd2d9a360cf26495a23908cd11 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -18,6 +18,7 @@ This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/
 parse train/test set into paddle reader creators.
 """
 import paddle.v2.dataset.common
+import collections
 import tarfile
 
 __all__ = ['train', 'test', 'build_dict']
@@ -27,15 +28,14 @@ MD5 = '30177ea32e27c525793142b6bf2c8e2d'
 
 
 def word_count(f, word_freq=None):
-    add = paddle.v2.dataset.common.dict_add
-    if word_freq == None:
-        word_freq = {}
+    if word_freq is None:
+        word_freq = collections.defaultdict(int)
 
     for l in f:
         for w in l.strip().split():
-            add(word_freq, w)
-        add(word_freq, '<s>')
-        add(word_freq, '<e>')
+            word_freq[w] += 1
+        word_freq['<s>'] += 1
+        word_freq['<e>'] += 1
 
     return word_freq
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index b02d70709519872767c9e49282fc79b6a06f9455..34757e3baf239b3206c2983e7cdec790948ae907 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -20,8 +20,10 @@ parse train/test set into paddle reader creators.
 """
 import tarfile
+import gzip
 
 from paddle.v2.dataset.common import download
+from paddle.v2.parameters import Parameters
 
 __all__ = ['train', 'test', 'build_dict']
 
@@ -30,6 +32,9 @@ MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
 # this is a small set of data for test. The original data is too large and will be added later.
 URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
 MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6'
+# this is the pretrained model, whose BLEU score is 26.92
+URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
+MD5_MODEL = '6b097d23e15654608c6f74923e975535'
 
 START = "<s>"
 END = "<e>"
@@ -126,5 +131,13 @@ def test(dict_size):
         download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
 
 
+def model():
+    tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL)
+    with gzip.open(tar_file, 'r') as f:
+        parameters = Parameters.from_tar(f)
+    return parameters
+
+
 def fetch():
     download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    download(URL_MODEL, 'wmt14', MD5_MODEL)
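The new `wmt14.model()` helper downloads the pretrained translation model (reported BLEU 26.92) and deserializes it with `Parameters.from_tar`. A hedged sketch of how it is meant to be consumed; the inspection loop is illustrative:

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    # fetch and load the published pretrained parameters
    parameters = paddle.dataset.wmt14.model()
    for name in parameters.names():
        print name, parameters.get_shape(name)

`get_shape` here relies on the `Parameters.get_shape` fallback added later in this patch, which substitutes `(1, size)` when a parameter config carries no explicit dims.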
""" import paddle.v2.dataset.common +import collections import tarfile __all__ = ['train', 'test', 'build_dict'] @@ -27,15 +28,14 @@ MD5 = '30177ea32e27c525793142b6bf2c8e2d' def word_count(f, word_freq=None): - add = paddle.v2.dataset.common.dict_add - if word_freq == None: - word_freq = {} + if word_freq is None: + word_freq = collections.defaultdict(int) for l in f: for w in l.strip().split(): - add(word_freq, w) - add(word_freq, '') - add(word_freq, '') + word_freq[w] += 1 + word_freq[''] += 1 + word_freq[''] += 1 return word_freq diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index b02d70709519872767c9e49282fc79b6a06f9455..34757e3baf239b3206c2983e7cdec790948ae907 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -20,8 +20,10 @@ parse train/test set into paddle reader creators. """ import tarfile +import gzip from paddle.v2.dataset.common import download +from paddle.v2.parameters import Parameters __all__ = ['train', 'test', 'build_dict'] @@ -30,6 +32,9 @@ MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' # this is a small set of data for test. The original data is too large and will be add later. URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz' MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6' +# this is the pretrained model, whose bleu = 26.92 +URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +MD5_MODEL = '6b097d23e15654608c6f74923e975535' START = "" END = "" @@ -126,5 +131,13 @@ def test(dict_size): download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size) +def model(): + tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL) + with gzip.open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + def fetch(): download(URL_TRAIN, 'wmt14', MD5_TRAIN) + download(URL_MODEL, 'wmt14', MD5_MODEL) diff --git a/python/paddle/v2/evaluator.py b/python/paddle/v2/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..588eefa3912799aa55f970c6d7e013ed7779ec9a --- /dev/null +++ b/python/paddle/v2/evaluator.py @@ -0,0 +1,47 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 1e4efedde363f20fde168941adcb6e8a594b533a..384de9b9d57f88e84ab6067846174bb037502dc0 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -33,40 +33,52 @@ The primary usage shows below.
 
 import collections
 import inspect
-from config_base import Layer, __convert_to_v2__
+import re
+
 import paddle.trainer_config_helpers as conf_helps
+from paddle.trainer.config_parser import \
+    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
+    RecurrentLayerGroupEnd, model_type
 from paddle.trainer_config_helpers.config_parser_utils import \
     parse_network_config as __parse__
 from paddle.trainer_config_helpers.default_decorators import wrap_act_default
 from paddle.trainer_config_helpers.default_decorators import \
     wrap_bias_attr_default
 from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
 from paddle.trainer_config_helpers.layers import layer_support
-from paddle.trainer.config_parser import \
-    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
-    RecurrentLayerGroupEnd, model_type
 
 import activation
-import re
+import attr
 import data_type
+from config_base import Layer, __convert_to_v2__
 
 __all__ = ['parse_network', 'data']
 
 
-def parse_network(*outputs):
+def parse_network(output_layers, extra_layers=None):
     """
-    Parse all output layers and then generate a ModelConfig object.
+    Parse all layers in the neural network graph and
+    then generate a ModelConfig object.
 
     ..  note::
 
         This function is used internally in paddle.v2 module. User should never
         invoke this method.
 
-    :param outputs: Output layers.
-    :type outputs: Layer
+    :param output_layers: Output layers.
+    :type output_layers: Layer
+    :param extra_layers: Layers in the neural network graph that are not on the
+                         path of output_layers but still need to be parsed.
+    :type extra_layers: Layer
     :return: A ModelConfig object instance.
     :rtype: ModelConfig
     """
+    if not isinstance(output_layers, collections.Sequence):
+        output_layers = [output_layers]
+    if extra_layers is not None and not isinstance(extra_layers,
                                                    collections.Sequence):
+        extra_layers = [extra_layers]
 
     def __real_func__():
         """
@@ -74,7 +86,11 @@
         the plain old paddle configuration function.
         """
         context = dict()
-        real_output = [each.to_proto(context=context) for each in outputs]
+        real_output = [each.to_proto(context=context) for each in output_layers]
+        if extra_layers is not None:
+            extra_output = [
+                each.to_proto(context=context) for each in extra_layers
+            ]
         conf_helps.outputs(real_output)
 
     return __parse__(__real_func__)
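`parse_network` now takes an explicit list of output layers plus an optional `extra_layers` keyword instead of varargs; the extra layers are converted to proto purely for the side effect of entering the shared parse context (the `extra_output` result is deliberately unused). A hedged sketch mirroring how the updated unit tests drive it:

    import paddle.v2 as paddle

    pixel = paddle.layer.data(name='pixel', type=paddle.data_type.dense_vector(128))
    label = paddle.layer.data(name='label', type=paddle.data_type.integer_value(10))
    output = paddle.layer.fc(input=pixel, size=10, act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=output, label=label)

    # an auxiliary layer that is not reachable from `cost`
    predict = paddle.layer.max_id(input=output)

    # old style: parse_network(cost1, cost2); new style: pass one list,
    # and route off-path layers through extra_layers.
    print paddle.layer.parse_network([cost], extra_layers=[predict])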
""" context = dict() - real_output = [each.to_proto(context=context) for each in outputs] + real_output = [each.to_proto(context=context) for each in output_layers] + if extra_layers is not None: + extra_output = [ + each.to_proto(context=context) for each in extra_layers + ] conf_helps.outputs(real_output) return __parse__(__real_func__) @@ -119,54 +135,23 @@ class DataLayerV2(Layer): return doc -class WithExtraParent(Layer): - def extra_parent(self): - return self.__extra_parent__ - - def __init__(self, name=None, parent_layers=None): - self.__extra_parent__ = [] - super(WithExtraParent, self).__init__( - name=name, parent_layers=parent_layers) - - def append_extra_parent(self, parent): - self.__extra_parent__.append(parent) - - def to_proto(self, context): +class MemoryV2(Layer): + def __init__(self, name, extra_input=None, **kwargs): """ - function to set proto attribute + Init memory object, if memory is inited inside recurrent_group step + function, it may depend on a boot_layer that should be initialized + outside recurrent_group, so we: + 1. add RecurrentLayerInput to extra_parent of self. + 2. add boot_layer to the extra_parent of RecurrentLayerInput. + + :param extra_input: list of RecurrentLayerInput + :type extra_input: [RecurrentLayerInput] """ - kwargs = dict() - for p in self.__extra_parent__: - p.to_proto(context=context) - - for layer_name in self.__parent_layers__: - if not isinstance(self.__parent_layers__[layer_name], - collections.Sequence): - v1_layer = self.__parent_layers__[layer_name].to_proto( - context=context) - else: - v1_layer = map(lambda x: x.to_proto(context=context), - self.__parent_layers__[layer_name]) - kwargs[layer_name] = v1_layer - - if self.context_name() is None: - return self.to_proto_impl(context=context, **kwargs) - elif self.context_name() not in context: - context[self.context_name()] = self.to_proto_impl( - context=context, **kwargs) - - if self.use_context_name(): - return context[self.context_name()] - else: - return context[self.name] - - -class MemoryV2(WithExtraParent): - def __init__(self, name, **kwargs): self.name = name super(MemoryV2, self).__init__(name=name, parent_layers=dict()) self.__kwargs__ = kwargs self.__boot_layer_name__ = None + if 'boot_layer' in kwargs: begin_of_current_rnn = [] # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a @@ -189,11 +174,10 @@ class MemoryV2(WithExtraParent): assert begin_of_current_rnn is not None for extra in begin_of_current_rnn: self.append_extra_parent(extra) - assert isinstance(extra, WithExtraParent) extra.append_extra_parent(kwargs['boot_layer']) self.__boot_layer_name__ = kwargs['boot_layer'].name - def to_proto_impl(self, context, **kwargs): + def to_proto_impl(self, **kwargs): args = dict() for each in kwargs: args[each] = kwargs[each] @@ -201,7 +185,7 @@ class MemoryV2(WithExtraParent): args[each] = self.__kwargs__[each] if self.__boot_layer_name__ is not None: - args['boot_layer'] = context[self.__boot_layer_name__] + args['boot_layer'] = self.__context__[self.__boot_layer_name__] size = args.get('size', None) if size is not None: @@ -223,22 +207,6 @@ class MemoryV2(WithExtraParent): return True -class LayerOutputV2(Layer): - """ - LayerOutputV2 is used to store the result of LayerOutput in v1 api. - It will not store it's parents because layer_output has been parsed already. 
-    """
-
-    def __init__(self, layer_output):
-        assert isinstance(layer_output, conf_helps.LayerOutput)
-        self.layer_output = layer_output
-        super(LayerOutputV2, self).__init__(
-            name=layer_output.name, parent_layers=dict())
-
-    def to_proto_impl(self):
-        return self.layer_output
-
-
 class StaticInputV2(object):
     def __init__(self, input, is_seq=False, size=None):
         assert isinstance(input, LayerV2)
@@ -250,6 +218,66 @@ class StaticInputV2(object):
         # assert input.size is not None or size is not None
 
 
+class BaseGeneratedInputV2(object):
+    def __init__(self):
+        self.bos_id = None
+        self.eos_id = None
+
+    def before_real_step(self):
+        raise NotImplementedError()
+
+    def after_real_step(self, *args):
+        raise NotImplementedError()
+
+
+class GeneratedInputV2(BaseGeneratedInputV2):
+    def __init__(self, size, embedding_name, embedding_size):
+        super(GeneratedInputV2, self).__init__()
+        self.size = size
+        self.embedding_name = embedding_name
+        self.embedding_size = embedding_size
+
+    def after_real_step(self, input):
+        return max_id(input=input, name='__beam_search_predict__')
+
+    def before_real_step(self):
+        predict_id = memory(
+            name='__beam_search_predict__',
+            size=self.size,
+            boot_with_const_id=self.bos_id)
+
+        trg_emb = embedding(
+            input=predict_id,
+            size=self.embedding_size,
+            param_attr=attr.ParamAttr(name=self.embedding_name))
+        return trg_emb
+
+
+class RecurrentLayerGroupSetGeneratorV2(Layer):
+    def __init__(self, eos_name, max_length, beam_size, num_results_per_sample):
+        self.eos_name = eos_name
+        self.max_length = max_length
+        self.beam_size = beam_size
+        self.num_results_per_sample = num_results_per_sample
+        super(RecurrentLayerGroupSetGeneratorV2, self).__init__(
+            name=eos_name, parent_layers={})
+
+    def to_proto_impl(self, **kwargs):
+        RecurrentLayerGroupSetGenerator(
+            Generator(
+                eos_layer_name=self.eos_name,
+                max_num_frames=self.max_length,
+                beam_size=self.beam_size,
+                num_results_per_sample=self.num_results_per_sample))
+        return self
+
+    def context_name(self):
+        return self.eos_name + ".fake"
+
+    def use_context_name(self):
+        return True
+
+
 class MixedLayerV2(Layer):
     """
     This class is used to support `with` grammar. If not, the following code
@@ -328,18 +356,24 @@ def mixed(size=0,
     return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
 
 
-class RecurrentLayerInput(WithExtraParent):
+class RecurrentLayerInput(Layer):
     def __init__(self, recurrent_name, index, parent_layers):
-        assert len(parent_layers) == 1
-        self.__parents__ = parent_layers.values()[0]
-        super(RecurrentLayerInput, self).__init__(
-            name=self.__parents__[index].name, parent_layers=parent_layers)
+        parents_len = len(parent_layers)
+        assert parents_len <= 1
+        if parents_len == 0:
+            self.__parents__ = []
+        else:
+            self.__parents__ = parent_layers.values()[0]
         self.__recurrent_name__ = recurrent_name
+        name = self.__parents__[
+            index].name if index >= 0 else self.context_name()
+        super(RecurrentLayerInput, self).__init__(
+            name=name, parent_layers=parent_layers)
 
     def context_name(self):
         return self.__recurrent_name__ + ".begin"
 
-    def to_proto_impl(self, context, **kwargs):
+    def to_proto_impl(self, **kwargs):
         model_type('recurrent_nn')
         RecurrentLayerGroupWithoutOutLinksBegin(
             name=self.__recurrent_name__,
@@ -436,6 +470,11 @@ def recurrent_group(step, input, name=None):
         for i in xrange(len(non_static_inputs))
     ]
 
+    extra_input = None
+    if len(non_static_inputs) == 0:
+        extra_input = RecurrentLayerInput(
+            recurrent_name=name, index=-1, parent_layers={})
+
    def __real_step__(*args):
         rnn_input = list(args)
         static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
@@ -443,6 +482,7 @@ def recurrent_group(step, input, name=None):
             mem_name = "__%s_memory__" % static_input.input.name
             mem = memory(
                 name=mem_name,
+                extra_input=extra_input,
                 is_seq=static_input.is_seq,
                 size=static_input.input.calculate_size,
                 boot_layer=static_input.input)
@@ -472,6 +512,73 @@ def recurrent_group(step, input, name=None):
     return retv
 
 
+@wrap_name_default()
+def beam_search(step,
+                input,
+                bos_id,
+                eos_id,
+                beam_size,
+                max_length=500,
+                name=None,
+                num_results_per_sample=None):
+    if num_results_per_sample is None:
+        num_results_per_sample = beam_size
+    assert num_results_per_sample <= beam_size
+    # logger.warning("num_results_per_sample should be less than beam_size")
+
+    if isinstance(input, StaticInputV2) or isinstance(input,
+                                                      BaseGeneratedInputV2):
+        input = [input]
+
+    generated_input_index = -1
+
+    real_input = []
+    for i, each_input in enumerate(input):
+        assert isinstance(each_input, StaticInputV2) or isinstance(
+            each_input, BaseGeneratedInputV2)
+        if isinstance(each_input, BaseGeneratedInputV2):
+            assert generated_input_index == -1
+            generated_input_index = i
+        else:
+            real_input.append(each_input)
+
+    assert generated_input_index != -1
+
+    gipt = input[generated_input_index]
+    assert isinstance(gipt, BaseGeneratedInputV2)
+
+    gipt.bos_id = bos_id
+    gipt.eos_id = eos_id
+
+    def __real_step__(*args):
+        eos_name = "__%s_eos_layer__" % name
+        generator = RecurrentLayerGroupSetGeneratorV2(
+            eos_name, max_length, beam_size, num_results_per_sample)
+
+        args = list(args)
+        before_step_layer = gipt.before_real_step()
+        before_step_layer.append_child(
+            layer=generator, parent_names=[before_step_layer.name])
+        args.insert(generated_input_index, before_step_layer)
+
+        predict = gipt.after_real_step(step(*args))
+
+        eos_layer = eos(input=predict, eos_id=eos_id, name=eos_name)
+        predict.append_child(layer=eos_layer, parent_names=[predict.name])
+
+        return predict
+
+    tmp = recurrent_group(step=__real_step__, input=real_input, name=name)
+
+    return tmp
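`beam_search` wires a `GeneratedInputV2` and any `StaticInputV2` feeds into a `recurrent_group` whose generator is configured by `RecurrentLayerGroupSetGeneratorV2`. A hedged sketch of the intended v2 call, loosely following the seqToseq demo; the decoder step, sizes, and dictionary ids below are illustrative stand-ins:

    import paddle.v2 as paddle
    from paddle.v2.layer import StaticInputV2, GeneratedInputV2, beam_search

    def decoder_step(encoded, current_word):
        # a real decoder would mix the encoder state with the embedding of
        # the previously generated word; one fc layer stands in for that here.
        hidden = paddle.layer.fc(
            input=[encoded, current_word], size=512, act=paddle.activation.Tanh())
        return paddle.layer.fc(
            input=hidden, size=30000, act=paddle.activation.Softmax())

    encoded = paddle.layer.data(
        name='encoded_sequence', type=paddle.data_type.dense_vector_sequence(512))

    gen = beam_search(
        step=decoder_step,
        input=[
            StaticInputV2(input=encoded),
            GeneratedInputV2(
                size=30000,
                embedding_name='_target_language_embedding',
                embedding_size=512)
        ],
        bos_id=0,  # id of <s> in the target dictionary
        eos_id=1,  # id of <e>
        beam_size=3,
        max_length=250)

Note that the step function receives the generated word at the position the `GeneratedInputV2` occupies in the `input` list.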
+
+
 __projection_names__ = filter(lambda x: x.endswith('_projection'),
                               dir(conf_helps))
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index 05dc5c68dd97b00fb15b74564a32313430c45345..d686d09f220671fce50be0784e354f97cb109f32 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -159,7 +159,8 @@ class Parameters(object):
         if not self.has_key(key):
             raise ValueError("No such parameter %s" % key)
         conf = self.__param_conf__[key]
-        return tuple(map(int, conf.dims))
+        dims = conf.dims if conf.dims else (1, conf.size)
+        return tuple(map(int, dims))
 
     def __setitem__(self, key, value):
         """
diff --git a/python/paddle/v2/plot/__init__.py b/python/paddle/v2/plot/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..acd3013db4e6a57cd1b269266bea82a31e928397
--- /dev/null
+++ b/python/paddle/v2/plot/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from plot import Ploter
+
+__all__ = ['Ploter']
diff --git a/python/paddle/v2/plot/plot.py b/python/paddle/v2/plot/plot.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f7bd039b07db4832295c2374293bffa588eb4ef
--- /dev/null
+++ b/python/paddle/v2/plot/plot.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+class PlotData(object):
+    def __init__(self):
+        self.step = []
+        self.value = []
+
+    def append(self, step, value):
+        self.step.append(step)
+        self.value.append(value)
+
+    def reset(self):
+        self.step = []
+        self.value = []
+
+
+class Ploter(object):
+    def __init__(self, *args):
+        self.__args__ = args
+        self.__plot_data__ = {}
+        for title in args:
+            self.__plot_data__[title] = PlotData()
+        # demos in notebooks use Ploter to plot figures, but when we convert
+        # an ipynb to a py file for testing, importing matplotlib would make
+        # the script crash, so `export DISABLE_PLOT=True` can be used to skip
+        # importing these libs
+        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
+        if not self.__plot_is_disabled__():
+            import matplotlib.pyplot as plt
+            from IPython import display
+            self.plt = plt
+            self.display = display
+
+    def __plot_is_disabled__(self):
+        return self.__disable_plot__ == "True"
+
+    def append(self, title, step, value):
+        assert isinstance(title, basestring)
+        assert title in self.__plot_data__
+        data = self.__plot_data__[title]
+        assert isinstance(data, PlotData)
+        data.append(step, value)
+
+    def plot(self):
+        if self.__plot_is_disabled__():
+            return
+
+        titles = []
+        for title in self.__args__:
+            data = self.__plot_data__[title]
+            assert isinstance(data, PlotData)
+            if len(data.step) > 0:
+                titles.append(title)
+                self.plt.plot(data.step, data.value)
+        self.plt.legend(titles, loc='upper left')
+        self.display.clear_output(wait=True)
+        self.display.display(self.plt.gcf())
+        self.plt.gcf().clear()
+
+    def reset(self):
+        for key in self.__plot_data__:
+            data = self.__plot_data__[key]
+            assert isinstance(data, PlotData)
+            data.reset()
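The tests below exercise the PlotData bookkeeping; the interactive side of Ploter is meant to be driven from a notebook. A short hedged sketch of that intended workflow (the cost numbers are fabricated for illustration):

    from paddle.v2.plot import Ploter

    train_title = "Train cost"
    plotter = Ploter(train_title)

    # in a real notebook this would live in the trainer's event handler,
    # appending the running cost every N batches.
    for step, cost in enumerate([2.3, 1.9, 1.4, 1.1]):
        plotter.append(train_title, step, cost)
        plotter.plot()

Setting `DISABLE_PLOT=True` in the environment (as run_tests.sh below does) turns `plot()` into a no-op, so the module can be imported on headless CI machines.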
diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..da550a178ce0fe4832b640e0c23505279dedd27a
--- /dev/null
+++ b/python/paddle/v2/plot/tests/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_test(NAME test_ploter
+  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/plot/tests/run_tests.sh
+  ${PYTHON_EXECUTABLE})
diff --git a/python/paddle/v2/plot/tests/__init__.py b/python/paddle/v2/plot/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1abfc08f19505a9010e924e34074e5bc3cc0571
--- /dev/null
+++ b/python/paddle/v2/plot/tests/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import test_ploter
+
+__all__ = ['test_ploter']
diff --git a/python/paddle/v2/plot/tests/run_tests.sh b/python/paddle/v2/plot/tests/run_tests.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9c1a4a71ce43f285c4f970eddf6af46a2821a40a
--- /dev/null
+++ b/python/paddle/v2/plot/tests/run_tests.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ + +pushd `dirname $0` > /dev/null +SCRIPTPATH=$PWD +popd > /dev/null + +cd $SCRIPTPATH +$1 -m pip install ../../../../../paddle/dist/*.whl + +export DISABLE_PLOT="True" +test_list="test_ploter.py" + +export PYTHONPATH=$PWD/../../../../../python/ + +for fn in $test_list +do + echo "test $fn" + $1 $fn + if [ $? -ne 0 ]; then + exit 1 + fi +done diff --git a/python/paddle/v2/plot/tests/test_ploter.py b/python/paddle/v2/plot/tests/test_ploter.py new file mode 100644 index 0000000000000000000000000000000000000000..a75f853ed933dfce651faf758f71feca7cd8d328 --- /dev/null +++ b/python/paddle/v2/plot/tests/test_ploter.py @@ -0,0 +1,40 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.v2.plot import Ploter + + +class TestCommon(unittest.TestCase): + def test_append(self): + title1 = "title1" + title2 = "title2" + plot_test = Ploter(title1, title2) + plot_test.append(title1, 1, 2) + plot_test.append(title1, 2, 5) + plot_test.append(title2, 3, 4) + self.assertEqual(plot_test.__plot_data__[title1].step, [1, 2]) + self.assertEqual(plot_test.__plot_data__[title1].value, [2, 5]) + self.assertEqual(plot_test.__plot_data__[title2].step, [3]) + self.assertEqual(plot_test.__plot_data__[title2].value, [4]) + plot_test.reset() + self.assertEqual(plot_test.__plot_data__[title1].step, []) + self.assertEqual(plot_test.__plot_data__[title1].value, []) + self.assertEqual(plot_test.__plot_data__[title2].step, []) + self.assertEqual(plot_test.__plot_data__[title2].value, []) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py index 89cc928dd7f624612ba717b4e5c2d6c2de7f8bed..c67f3b84d96eb92d94ad80cc54c5e056103c1a1a 100644 --- a/python/paddle/v2/tests/test_layer.py +++ b/python/paddle/v2/tests/test_layer.py @@ -19,6 +19,7 @@ import paddle.v2.data_type as data_type import paddle.v2.layer as layer import paddle.v2.pooling as pooling import paddle.v2.networks as networks +import paddle.v2.evaluator as evaluator pixel = layer.data(name='pixel', type=data_type.dense_vector(128)) label = layer.data(name='label', type=data_type.integer_value(10)) @@ -58,13 +59,13 @@ class ImageLayerTest(unittest.TestCase): num_channels=16, pool_type=pooling.Max()) maxout = layer.maxout(input=conv, num_channels=16, groups=4) - print layer.parse_network(maxpool, spp, maxout) + print layer.parse_network([maxpool, spp, maxout]) def test_norm_layer(self): norm1 = layer.img_cmrnorm(input=conv, size=5) norm2 = layer.batch_norm(input=conv) norm3 = layer.sum_to_one_norm(input=conv) - print layer.parse_network(norm1, norm2, norm3) + print layer.parse_network([norm1, norm2, norm3]) class AggregateLayerTest(unittest.TestCase): @@ -77,7 +78,8 @@ class AggregateLayerTest(unittest.TestCase): first_seq = layer.first_seq(input=pixel) concat = layer.concat(input=[last_seq, first_seq]) seq_concat = layer.seq_concat(a=last_seq, b=first_seq) - print layer.parse_network(pool, last_seq, 
first_seq, concat, seq_concat) + print layer.parse_network( + [pool, last_seq, first_seq, concat, seq_concat]) class MathLayerTest(unittest.TestCase): @@ -94,8 +96,10 @@ class MathLayerTest(unittest.TestCase): tensor = layer.tensor(a=pixel, b=pixel, size=1000) cos_sim = layer.cos_sim(a=pixel, b=pixel) trans = layer.trans(input=tensor) - print layer.parse_network(addto, linear_comb, interpolation, power, - scaling, slope, tensor, cos_sim, trans) + print layer.parse_network([ + addto, linear_comb, interpolation, power, scaling, slope, tensor, + cos_sim, trans + ]) class ReshapeLayerTest(unittest.TestCase): @@ -109,7 +113,8 @@ class ReshapeLayerTest(unittest.TestCase): repeat = layer.repeat(input=pixel, num_repeats=4) reshape = layer.seq_reshape(input=pixel, reshape_size=4) rotate = layer.rotate(input=pixel, height=16, width=49) - print layer.parse_network(block_expand, expand, repeat, reshape, rotate) + print layer.parse_network( + [block_expand, expand, repeat, reshape, rotate]) class RecurrentLayerTest(unittest.TestCase): @@ -118,7 +123,7 @@ class RecurrentLayerTest(unittest.TestCase): recurrent = layer.recurrent(input=word) lstm = layer.lstmemory(input=word) gru = layer.grumemory(input=word) - print layer.parse_network(recurrent, lstm, gru) + print layer.parse_network([recurrent, lstm, gru]) class CostLayerTest(unittest.TestCase): @@ -138,10 +143,10 @@ class CostLayerTest(unittest.TestCase): cost10 = layer.sum_cost(input=inference) cost11 = layer.huber_cost(input=score, label=label) - print layer.parse_network(cost1, cost2) - print layer.parse_network(cost3, cost4) - print layer.parse_network(cost5, cost6) - print layer.parse_network(cost7, cost8, cost9, cost10, cost11) + print layer.parse_network([cost1, cost2]) + print layer.parse_network([cost3, cost4]) + print layer.parse_network([cost5, cost6]) + print layer.parse_network([cost7, cost8, cost9, cost10, cost11]) crf = layer.crf(input=inference, label=label) crf_decoding = layer.crf_decoding(input=inference, size=3) @@ -150,8 +155,8 @@ class CostLayerTest(unittest.TestCase): nce = layer.nce(input=inference, label=label, num_classes=3) hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3) - print layer.parse_network(crf, crf_decoding, ctc, warp_ctc, nce, - hsigmoid) + print layer.parse_network( + [crf, crf_decoding, ctc, warp_ctc, nce, hsigmoid]) class OtherLayerTest(unittest.TestCase): @@ -159,7 +164,7 @@ class OtherLayerTest(unittest.TestCase): maxid = layer.max_id(input=inference) sampling_id = layer.sampling_id(input=inference) eos = layer.eos(input=maxid, eos_id=5) - print layer.parse_network(maxid, sampling_id, eos) + print layer.parse_network([maxid, sampling_id, eos]) def test_slicing_joining_layer(self): pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1]) @@ -262,5 +267,20 @@ class NetworkTests(unittest.TestCase): print layer.parse_network(vgg_out) +class EvaluatorTest(unittest.TestCase): + def test_evaluator(self): + img = layer.data(name='pixel', type=data_type.dense_vector(784)) + output = layer.fc(input=img, + size=10, + act=activation.Softmax(), + name='fc_here') + lbl = layer.data(name='label', type=data_type.integer_value(10)) + cost = layer.cross_entropy_cost(input=output, label=lbl) + + evaluator.classification_error(input=output, label=lbl) + print layer.parse_network(cost) + print layer.parse_network(output) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py index 
f0679c5675b0c0f24f28f3df22efd4eb51ccbb3a..737b6bf1e2eb60281d4d6e92667d9fe91e243704 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -17,7 +17,6 @@ import collections
 from paddle.proto.ModelConfig_pb2 import ModelConfig
 
 import layer as v2_layer
-from layer import WithExtraParent
 
 __all__ = ['Topology']
 
@@ -41,9 +40,8 @@ def __bfs_travel__(callback, *layers):
         __break__ = callback(each_layer)
         if __break__:
             return
-        __layers__ = each_layer.__parent_layers__.values()
-        if isinstance(each_layer, WithExtraParent):
-            __layers__ = __layers__ + each_layer.extra_parent()
+        __layers__ = each_layer.__parent_layers__.values() + \
+            each_layer.extra_parent()
         __bfs_travel__(callback, *__layers__)
 
 
@@ -53,14 +51,26 @@ class Topology(object):
     and network configs.
     """
 
-    def __init__(self, layers):
-        if not isinstance(layers, collections.Sequence):
-            __check_layer_type__(layers)
-            layers = [layers]
-        for layer in layers:
-            __check_layer_type__(layer)
+    def __init__(self, layers, extra_layers=None):
+        def __check__(layers):
+            if not isinstance(layers, collections.Sequence):
+                __check_layer_type__(layers)
+                layers = [layers]
+            for layer in layers:
+                __check_layer_type__(layer)
+            return layers
+
+        layers = __check__(layers)
         self.layers = layers
-        self.__model_config__ = v2_layer.parse_network(*layers)
+        if extra_layers is not None:
+            extra_layers = __check__(extra_layers)
+
+        self.__model_config__ = v2_layer.parse_network(
+            layers, extra_layers=extra_layers)
+
+        if extra_layers is not None:
+            self.layers.extend(extra_layers)
+
         assert isinstance(self.__model_config__, ModelConfig)
 
     def proto(self):
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 4a6d912596da472059dc5c5a673017a66d9b71b4..31222a89e4923e8c20f42a08f7b4b681dc552db3 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -37,9 +37,12 @@ class SGD(object):
     :type cost: paddle.v2.config_base.Layer
     :param parameters: The parameters dictionary.
     :type parameters: paddle.v2.parameters.Parameters
+    :param extra_layers: Layers in the neural network graph that are not
+                         on the path of the cost layer.
+    :type extra_layers: paddle.v2.config_base.Layer
     """
 
-    def __init__(self, cost, parameters, update_equation):
+    def __init__(self, cost, parameters, update_equation, extra_layers=None):
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
 
@@ -47,11 +50,17 @@ class SGD(object):
         if not isinstance(update_equation, v2_optimizer.Optimizer):
             raise TypeError("update equation parameter must be "
                             "paddle.v2.optimizer.Optimizer")
-        topology = Topology(cost)
+        topology = Topology(cost, extra_layers=extra_layers)
         self.__optimizer__ = update_equation
         self.__topology__ = topology
         self.__parameters__ = parameters
         self.__topology_in_proto__ = topology.proto()
+
+        # In local mode, disable sparse_remote_update.
+        for param in self.__topology_in_proto__.parameters:
+            if param.sparse_remote_update:
+                param.sparse_remote_update = False
+
         self.__data_types__ = topology.data_type()
         gm = api.GradientMachine.createFromConfigProto(
             self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
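Topology and SGD now thread `extra_layers` down to `parse_network`, so graph nodes that are not reachable from the cost (generation outputs, layers that evaluators hang off) survive parsing and get parameters created for them. A hedged sketch of the intended trainer setup; the layer sizes are illustrative:

    import paddle.v2 as paddle

    images = paddle.layer.data(name='pixel', type=paddle.data_type.dense_vector(784))
    label = paddle.layer.data(name='label', type=paddle.data_type.integer_value(10))
    output = paddle.layer.fc(input=images, size=10, act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=output, label=label)

    # an auxiliary prediction layer that is not on the cost path
    predict = paddle.layer.max_id(input=output)

    parameters = paddle.parameters.create(cost)
    optimizer = paddle.optimizer.Momentum(learning_rate=0.1 / 128.0, momentum=0.9)
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 extra_layers=[predict])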
diff --git a/python/setup.py.in b/python/setup.py.in
index 68ca35265cf13265ad0b171b0f70e20b83006ff9..4ac35e3b8d6049a9024a6e0c9bb6804900f82197 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -7,7 +7,8 @@ packages=['paddle',
           'paddle.utils',
           'paddle.v2',
           'paddle.v2.dataset',
-          'paddle.v2.reader']
+          'paddle.v2.reader',
+          'paddle.v2.plot']
 
 setup(name='paddle',
       version='${PADDLE_VERSION}',