Commit 5dd7586e authored by Q qijun

merge from baidu/develop

......@@ -12,19 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License
cmake_minimum_required(VERSION 3.0)
project(paddle CXX C)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
include(system)
if(ANDROID)
cmake_minimum_required(VERSION 3.7)
else()
cmake_minimum_required(VERSION 3.0)
endif()
project(paddle CXX C)
find_package(Sphinx)
find_package(CUDA QUIET)
if(NOT CMAKE_CROSSCOMPILING)
find_package(CUDA QUIET)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
include(system)
include(simd)
################################ Configurations #######################################
......@@ -51,6 +58,21 @@ if(NOT CMAKE_BUILD_TYPE)
FORCE)
endif()
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
endif()
set(WITH_GPU OFF CACHE STRING
"Disable GPU when cross-compiling for Android" FORCE)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when cross-compiling for Android" FORCE)
set(WITH_PYTHON OFF CACHE STRING
"Disable PYTHON when cross-compiling for Android" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android" FORCE)
endif(ANDROID)
set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
########################################################################################
......@@ -64,6 +86,7 @@ include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/any) # download linb::any
include(package) # set paddle packages
include(cpplint) # set paddle c++ style
......@@ -74,7 +97,6 @@ include(flags) # set paddle compile flags
include(cudnn) # set cudnn libraries
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(configure) # add paddle env configuration
include_directories("${PROJ_ROOT}")
......
......@@ -7,13 +7,12 @@ ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
# ENV variables
ARG BUILD_WOBOQ
ARG WITH_GPU
ARG WITH_AVX
ARG WITH_DOC
ARG WITH_STYLE_CHECK
ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-OFF}
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-OFF}
......@@ -48,7 +47,7 @@ RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
cd .. && rm -rf cmake-3.4.1
VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
VOLUME ["/woboq_out"]
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
RUN mkdir /var/run/sshd
......
......@@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF)
set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS
find_path(MKL_INC_DIR mkl.h PATHS
${MKL_ROOT}/include)
find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
......@@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${MKL_ROOT}/lib/intel64)
if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
set(CBLAS_PROVIDER MKL)
set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
set(CBLAS_INC_DIR ${MKL_INC_DIR})
set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
${MKL_SEQUENTIAL_LIB}
${MKL_CORE_LIB})
add_definitions(-DPADDLE_USE_MKL)
message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
set(CBLAS_FOUND ON)
if(MKL_LAPACK_INC_DIR)
add_definitions(-DPADDLE_USE_LAPACK)
message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
endif()
return() # return file.
endif()
......@@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND)
set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
set(CBLAS_FOUND ON)
if(ATLAS_CLAPACK_INC_DIR)
add_definitions(-DPADDLE_USE_LAPACK)
message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
endif()
return()
endif()
......@@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
set(CBLAS_PROVIDER OPENBLAS)
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
set(CBLAS_FOUND ON)
if(OPENBLAS_LAPACKE_INC_DIR)
add_definitions(-DPADDLE_USE_LAPACK)
message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
endif()
return()
endif()
......
......@@ -32,6 +32,14 @@ if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
if(NOT CMAKE_CROSSCOMPILING)
if(WITH_AVX AND AVX_FOUND)
set(SIMD_FLAG ${AVX_FLAG})
elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
endif()
if(NOT WITH_GPU)
add_definitions(-DPADDLE_ONLY_CPU)
add_definitions(-DHPPL_STUB_FUNC)
......@@ -48,21 +56,12 @@ else()
message(FATAL_ERROR "Paddle need cudnn to compile")
endif()
if(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
else(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
endif(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
else(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
endif(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
if(NOT WITH_GPU)
return()
endif()
set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
......
INCLUDE(ExternalProject)
SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
ExternalProject_Add(
linb_any
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/thelink2012/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020"
PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
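For context, a minimal sketch of the any/any_cast semantics this vendored header provides, shown with C++17's std::any as a stand-in (linb::any backports this interface to C++11; the ANY_IMPL_ANY_CAST_MOVEABLE definition above lets any_cast move out of an rvalue any):

```cpp
#include <any>  // C++17 stand-in for the vendored linb::any
#include <iostream>
#include <string>

int main() {
  std::any v = std::string("conv");  // type-erased storage
  std::cout << std::any_cast<std::string>(v) << "\n";
  v = 42;  // the same slot may later hold a different type
  std::cout << std::any_cast<int>(v) << "\n";
  try {
    std::any_cast<double>(v);  // wrong type: throws bad_any_cast
  } catch (const std::bad_any_cast& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}
```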
......@@ -31,9 +31,17 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/gflags/gflags.git"
PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
LIST(APPEND external_project_dependencies gflags)
......@@ -33,11 +33,19 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/google/glog.git"
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DWITH_GFLAGS=ON
CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
LIST(APPEND external_project_dependencies glog)
......@@ -41,11 +41,19 @@ IF(WITH_TESTING)
GIT_TAG "release-1.8.0"
PREFIX ${GTEST_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_GMOCK=ON
CMAKE_ARGS -Dgtest_disable_pthreads=ON
CMAKE_ARGS -Dgtest_force_shared_crt=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
LIST(APPEND external_project_dependencies gtest)
ENDIF(WITH_TESTING)
......@@ -29,7 +29,24 @@ IF(NOT ${CBLAS_FOUND})
IF(CMAKE_COMPILER_IS_GNUCC)
ENABLE_LANGUAGE(Fortran)
LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
if (NOT CMAKE_Fortran_COMPILER_VERSION)
# cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly.
execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion
OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION)
endif()
string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION})
list(GET Fortran_VERSION 0 Fortran_MAJOR)
list(GET Fortran_VERSION 1 Fortran_MINOR)
find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS
/lib
/usr/lib
/usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/
/usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/)
if (NOT GFORTRAN_LIBRARY)
message(FATAL_ERROR "Cannot found gfortran library which it is used by openblas")
endif()
find_package(Threads REQUIRED)
LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
ENDIF(CMAKE_COMPILER_IS_GNUCC)
IF(NOT CMAKE_Fortran_COMPILER)
......@@ -37,6 +54,8 @@ IF(NOT ${CBLAS_FOUND})
"you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
ENDIF(NOT CMAKE_Fortran_COMPILER)
ADD_DEFINITIONS(-DPADDLE_USE_LAPACK)
ExternalProject_Add(
openblas
${EXTERNAL_PROJECT_LOG_ARGS}
......
......@@ -58,12 +58,20 @@ IF(NOT PROTOBUF_FOUND)
GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
CONFIGURE_COMMAND
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
-Dprotobuf_BUILD_TESTS=OFF
-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=lib
-Dprotobuf_BUILD_TESTS=OFF
-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=lib
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DZLIB_ROOT:STRING=${ZLIB_ROOT}
)
LIST(APPEND external_project_dependencies protobuf)
......
......@@ -219,9 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
IF(NOT WITH_PYTHON)
IF(WITH_PYTHON)
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
ELSE()
SET(PYTHON_LIBRARIES "")
ENDIF()
......@@ -50,12 +50,19 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
CMAKE_ARGS -DWITH_GPU=${WITH_GPU}
CMAKE_ARGS -DWITH_OMP=${USE_OMP}
CMAKE_ARGS -DWITH_TORCH=OFF
CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
CMAKE_ARGS -DBUILD_SHARED=ON
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
LIST(APPEND external_project_dependencies warpctc)
......@@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire
IF(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
ELSE(WIN32)
set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
......@@ -34,10 +34,18 @@ ExternalProject_Add(
GIT_TAG "v1.2.8"
PREFIX ${ZLIB_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
)
LIST(APPEND external_project_dependencies zlib)
......@@ -2,6 +2,7 @@
include(CheckCXXCompilerFlag)
include(CheckCCompilerFlag)
include(CheckCXXSymbolExists)
include(CheckTypeSize)
function(CheckCompilerCXX11Flag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
......@@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag)
endfunction()
CheckCompilerCXX11Flag()
LIST(APPEND CMAKE_CXX_FLAGS -std=c++11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# safe_set_flag
#
......@@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS)
endif()
endif()
SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)
CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)
if(SPINLOCK_FOUND)
add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
endif(SPINLOCK_FOUND)
if(BARRIER_FOUND)
add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
endif(BARRIER_FOUND)
SET(CMAKE_EXTRA_INCLUDE_FILES "")
# Common flags. the compiler flag used for C/C++ sources whenever release or debug
# Do not care if this flag is support for gcc.
set(COMMON_FLAGS
......
......@@ -2,6 +2,7 @@
# so that PaddlePaddle can unleash the vectorization power of multicore CPUs.
INCLUDE(CheckCXXSourceRuns)
INCLUDE(CheckCXXSourceCompiles)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(MMX_FLAG "-mmmx")
......@@ -17,6 +18,8 @@ ELSEIF(MSVC)
SET(AVX2_FLAG "/arch:AVX2")
ENDIF()
set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
CHECK_CXX_SOURCE_RUNS("
......@@ -73,4 +76,5 @@ int main()
return 0;
}" AVX2_FOUND)
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
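For reference, a probe of the kind these CHECK_CXX_SOURCE_RUNS calls compile and execute (a sketch, not the exact source embedded in simd.cmake): the snippet is built with the matching flag, and a zero exit status sets the corresponding *_FOUND variable.

```cpp
// Minimal AVX probe: compiled with ${AVX_FLAG}; exit code 0 => AVX_FOUND.
#include <immintrin.h>

int main() {
  __m256 a = _mm256_set1_ps(1.0f);
  __m256 b = _mm256_set1_ps(2.0f);
  float out[8];
  _mm256_storeu_ps(out, _mm256_add_ps(a, b));
  return out[0] == 3.0f ? 0 : 1;
}
```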
......@@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
IF(DEFINED CMAKE_SYSTEM_NAME)
IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android")
SET(ANDROID TRUE)
ENDIF()
ENDIF()
# external dependencies log output
SET(EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD 0 # Wrap download in script to log output
......
......@@ -90,6 +90,10 @@ function(link_paddle_exe TARGET_NAME)
${RDMA_LD_FLAGS}
${RDMA_LIBS})
if(ANDROID)
target_link_libraries(${TARGET_NAME} log)
endif(ANDROID)
add_dependencies(${TARGET_NAME} ${external_project_dependencies})
endfunction()
......
import sys
import paddle.v2 as paddle
def seqToseq_net(source_dict_dim, target_dict_dim):
def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
beam_size = 3
max_length = 250
#### Encoder
src_word_id = paddle.layer.data(
name='source_language_word',
......@@ -67,30 +71,57 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For a decoder equipped with an attention mechanism, in training
# the target embedding (the ground truth) is the data input,
# while the encoded source sequence is accessed as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
if not is_generating:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For a decoder equipped with an attention mechanism, in training
# the target embedding (the ground truth) is the data input,
# while the encoded source sequence is accessed as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
else:
# In generation, the decoder predicts the next target word based on
# the encoded source sequence and the last generated target word.
# The encoded source sequence (the encoder's output) must be specified by
# StaticInput, which is a read-only memory.
# The embedding of the last generated word is automatically retrieved
# by GeneratedInput, which is initialized with a start mark, such as <s>,
# and must be included in generation.
trg_embedding = paddle.layer.GeneratedInputV2(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
def main():
......
......@@ -8,199 +8,255 @@ Please be aware that you will need to change `Dockers settings
<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
of your hardware resources on Mac OS X and Windows.
Working With Docker
-------------------
Docker is simple as long as we understand a few basic concepts:
- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly instead of installing all this software. We can type
.. code-block:: bash
docker images
to list all images in the system. We can also run
.. code-block:: bash
docker pull paddlepaddle/paddle:0.10.0rc2
to download a Docker image, paddlepaddle/paddle in this example,
from Dockerhub.com.
- *container*: if we consider a Docker image a program, a container is a
"process" that runs the image. Indeed, a container is exactly an
operating system process, but with a virtualized filesystem, network
port space, and other virtualized environments. We can type
.. code-block:: bash
docker run paddlepaddle/paddle:0.10.0rc2
to start a container to run a Docker image, paddlepaddle/paddle in this example.
- By default a docker container has an isolated file system namespace,
so we cannot see files in the host file system. By using a *volume*,
files mounted from the host become visible inside the docker container.
The following command mounts the current directory into /data inside a
docker container, running the container from the debian image with the
command :code:`ls /data`.
.. code-block:: bash
docker run --rm -v $(pwd):/data debian ls /data
Usage of CPU-only and GPU Images
----------------------------------
For each version of PaddlePaddle, we release two types of Docker images: a development
image and a production image. The production image includes a CPU-only version and a CUDA
GPU version, plus their no-AVX variants. We put the docker images on
`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
latest versions under "tags" tab at dockerhub.com.
1. development image :code:`paddlepaddle/paddle:<version>-dev`
For each version of PaddlePaddle, we release two types of Docker images:
a development image and a production image. The production image includes
a CPU-only version and a CUDA GPU version, plus their no-AVX variants. We
put the docker images on `dockerhub.com
<https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
latest versions under the "tags" tab at dockerhub.com.
This image packs the related development tools and runtime environment. Users and
developers can use this image instead of their own local computer to accomplish
development, building, releasing, document writing, etc. Since different versions of
paddle may depend on different versions of libraries and tools, if you want to
set up a local environment, you must pay attention to the versions.
The development image contains:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers use servers with GPUs; they can use ssh to log in to the server
and run :code:`docker exec` to enter the docker container and start their work.
They can also start a development docker image with an SSHD service, so they can log in to
the container and start working.
1. Production images, this image might have multiple variants:
To run the CPU-only image as an interactive container:
- GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
- GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
- CPU/AVX: :code:`paddlepaddle/paddle:<version>`
- CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
.. code-block:: bash
Please be aware that the CPU-only and the GPU images both use the
AVX instruction set, but old computers produced before 2008 do not
support AVX. The following command checks if your Linux computer
supports AVX:
docker run -it --rm paddledev/paddle:<version> /bin/bash
.. code-block:: bash
or, we can run it as a daemon container
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
.. code-block:: bash
To run the CPU-only image as an interactive container:
docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>
.. code-block:: bash
and SSH to this container using password :code:`root`:
docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash
.. code-block:: bash
The above method works with the GPU image too -- the recommended way is
using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
ssh -p 2202 root@localhost
Please install nvidia-docker first following this `tutorial
<https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
An advantage of using SSH is that we can connect to PaddlePaddle from
more than one terminal. For example, one terminal running vi and
another running the Python interpreter. Another advantage is that we
can run the PaddlePaddle container on a remote server and SSH to it
from a laptop.
Now you can run a GPU image:
.. code-block:: bash
2. Production images, this image might have multiple variants:
- GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
- GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
- CPU/AVX: :code:`paddlepaddle/paddle:<version>`
- CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
Please be aware that the CPU-only and the GPU images both use the AVX
instruction set, but old computers produced before 2008 do not support
AVX. The following command checks if your Linux computer supports
AVX:
2. development image :code:`paddlepaddle/paddle:<version>-dev`
.. code-block:: bash
This image packs the related development tools and runtime
environment. Users and developers can use this image instead of
their own local computer to accomplish development, building,
releasing, document writing, etc. Since different versions of paddle
may depend on different versions of libraries and tools, if you
want to set up a local environment, you must pay attention to the
versions. The development image contains:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers use servers with GPUs; they can use ssh to log in to
the server and run :code:`docker exec` to enter the docker
container and start their work. They can also start a development
docker image with an SSHD service, so they can log in to the container
and start working.
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
Train Model Using Python API
----------------------------
If it doesn't, we will use the non-AVX images.
Our official docker image provides a runtime for PaddlePaddle
programs. The typical workflow will be as follows:
The above methods work with the GPU image too -- just don't forget
to install the GPU driver. To support the GPU driver, we recommend using
`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_. Run using
Create a directory as workspace:
.. code-block:: bash
.. code-block:: bash
nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
mkdir ~/workspace
Note: If you have a problem running nvidia-docker, you may try the old method we used previously (not recommended).
Edit a PaddlePaddle python program using your favourite editor
.. code-block:: bash
.. code-block:: bash
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
emacs ~/workspace/example.py
Run the program using docker:
3. Use the production image to release your AI application
Suppose we have a simple application program in :code:`a.py`; we can test and run it using the production image:
.. code-block:: bash
```bash
docker run -it -v $PWD:/work paddle /work/a.py
```
docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py
But this works only if all dependencies of :code:`a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image with more dependencies installed.
Or if you are using GPU for training:
.. code-block:: bash
PaddlePaddle Book
------------------
nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py
The Jupyter Notebook is an open-source web application that allows
you to create and share documents that contain live code, equations,
visualizations and explanatory text in a single browser.
The above commands start a docker container by running :code:`python
/workspace/example.py`. It stops once :code:`python
/workspace/example.py` finishes.
PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
We already exposed port 8888 for this book. If you want to
dig deeper into deep learning, PaddlePaddle Book is definitely your best choice.
Another way is to tell docker to start a :code:`/bin/bash` session and
run PaddlePaddle program interactively:
We provide a packaged book image, simply issue the command:
.. code-block:: bash
docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash
# now we are inside docker container
cd /workspace
python example.py
Running with GPU is identical:
.. code-block:: bash
docker run -p 8888:8888 paddlepaddle/book
nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
# now we are inside docker container
cd /workspace
python example.py
Then, copy and paste the address into your local browser:
.. code-block:: text
Develop PaddlePaddle or Train Model Using C++ API
---------------------------------------------------
http://localhost:8888/
We will be using the PaddlePaddle development image since it contains all
the compilation tools and dependencies.
That's all. Enjoy your journey!
Let's clone PaddlePaddle repo first:
Development Using Docker
------------------------
.. code-block:: bash
Developers can work on PaddlePaddle using Docker. This allows
developers to work on different platforms -- Linux, Mac OS X, and
Windows -- in a consistent way.
git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
1. Build the Development Docker Image
Mount both the workspace folder and the paddle code folder into the docker
container, so we can access them inside the container. There are
two ways of using the PaddlePaddle development docker image:
.. code-block:: bash
- run interactive bash directly
git clone --recursive https://github.com/PaddlePaddle/Paddle
cd Paddle
docker build -t paddle:dev .
.. code-block:: bash
Note that by default :code:`docker build` doesn't import the source
tree into the image and build it. If we want to do that, we need to build the
development docker image and then run the following command:
# use nvidia-docker instead of docker if you need to use GPU
docker run -it -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /bin/bash
# now we are inside docker container
.. code-block:: bash
- or, we can run it as a daemon container
docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev
.. code-block:: bash
# use nvidia-docker instead of docker if you need to use GPU
docker run -d -p 2202:22 -p 8888:8888 -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /usr/sbin/sshd -D
2. Run the Development Environment
and SSH to this container using password :code:`root`:
Once we have the image :code:`paddle:dev`, we can use it to develop
Paddle by mounting the local source code tree into a container that
runs the image:
.. code-block:: bash
.. code-block:: bash
ssh -p 2202 root@localhost
docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev sshd
An advantage is that we can run the PaddlePaddle container on a
remote server and SSH to it from a laptop.
This runs a container of the development environment Docker image
with the local source tree mounted to :code:`/paddle` of the
container.
When developing PaddlePaddle, you can edit PaddlePaddle source code
from outside of the docker container using your favorite editor. To
compile PaddlePaddle, run inside the container:
The above :code:`docker run` command actually starts
an SSHD server listening on port 2202. This allows us to log into
this container with:
.. code-block:: bash
.. code-block:: bash
WITH_GPU=OFF WITH_AVX=ON WITH_TEST=ON bash /paddle/paddle/scripts/docker/build.sh
ssh root@localhost -p 2202
This builds everything about Paddle in :code:`/paddle/build`, and we
can run unit tests there:
Usually, I run the above commands on my Mac. I can also run them on a
GPU server :code:`xxx.yyy.zzz.www` and ssh to it from my Mac:
.. code-block:: bash
.. code-block:: bash
cd /paddle/build
ctest
my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
When training a model using the C++ API, we can edit the paddle program in
~/workspace outside of docker, and build it from /workspace inside of
docker.
3. Build and Install Using the Development Environment
PaddlePaddle Book
------------------
Once I am in the container, I can use
:code:`paddle/scripts/docker/build.sh` to build, install, and test
Paddle:
The Jupyter Notebook is an open-source web application that allows
you to create and share documents that contain live code, equations,
visualizations and explanatory text in a single browser.
.. code-block:: bash
PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
We already exposed port 8888 for this book. If you want to
dig deeper into deep learning, PaddlePaddle Book is definitely your best choice.
/paddle/paddle/scripts/docker/build.sh
We provide a packaged book image, simply issue the command:
This builds everything about Paddle in :code:`/paddle/build`, and
we can run unit tests there:
.. code-block:: bash
.. code-block:: bash
docker run -p 8888:8888 paddlepaddle/book
Then, copy and paste the address into your local browser:
.. code-block:: text
http://localhost:8888/
cd /paddle/build
ctest
That's all. Enjoy your journey!
Documentation
......
......@@ -55,6 +55,7 @@ extensions = [
'sphinx.ext.napoleon',
'sphinx.ext.graphviz'
]
mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"
table_styling_embed_css = True
autodoc_member_order = 'bysource'
......
......@@ -17,7 +17,11 @@ limitations under the License. */
#include <stdio.h>
#include "hl_base.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_neon_matrix_kernel.cuh"
#else
#include "hl_sse_matrix_kernel.cuh"
#endif
/**
* @brief cpu element wise unary operator.
......
......@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
typedef BaseOp SSEFirst;
typedef BaseOp SSESecond;
typedef BaseOp SSEClassificationError;
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_matrix_base_neon.cuh"
#else
#include "hl_matrix_base_sse.cuh"
#endif
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_BASE_NEON_CUH_
#define HL_MATRIX_BASE_NEON_CUH_
namespace aggregate {
class SSESum {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vaddq_f32(a, b);
}
};
class SSEMax {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vmaxq_f32(a, b);
}
};
class SSEMin {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vminq_f32(a, b);
}
};
} // namespace aggregate
namespace base {
namespace unary {
class SSEIdentity {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a) const {
return a;
}
};
} // namespace unary
namespace binary {
class SSEAdd {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vaddq_f32(a, b);
}
};
class SSEAdd2 {
public:
static const bool sse = true;
const real p1;
const real p2;
float32x4_t mp1;
float32x4_t mp2;
public:
SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
mp1 = vdupq_n_f32(p1);
mp2 = vdupq_n_f32(p2);
}
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp1, tmp2;
tmp1 = vmulq_f32(mp1, a);
tmp2 = vmulq_f32(mp2, b);
return vaddq_f32(tmp1, tmp2);
}
};
class SSESub {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vsubq_f32(a, b);
}
};
class SSEMul {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return vmulq_f32(a, b);
}
};
class SSEDiv {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp;
tmp = vrecpeq_f32(b);
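// NOTE: vrecpeq_f32 only gives a rough (~8-bit) reciprocal estimate;
// a vrecpsq_f32 Newton-Raphson refinement step would make a/b more accurate.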
return vmulq_f32(a, tmp);
}
};
class SSESquaredDiff {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
float32x4_t tmp;
tmp = vsubq_f32(a, b);
return vmulq_f32(tmp, tmp);
}
};
class SSEFirst {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return a;
}
};
class SSESecond {
public:
static const bool sse = true;
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
return b;
}
};
class SSEClassificationError {
public:
static const bool sse = true;
const real p;
float32x4_t mp;
uint32x4_t result;
public:
explicit SSEClassificationError(const real s) : p(s) {
mp = vdupq_n_f32(p);
result = vdupq_n_u32(1);
}
// TODO: to be checked
INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
uint32x4_t tmp1 = vcgtq_f32(a, mp);
uint32x4_t tmp2 = vcgtq_f32(b, mp);
uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
return vcvtq_f32_u32(vandq_u32(tmp3, result));
}
};
} // namespace binary
} // namespace base
#endif /* HL_MATRIX_BASE_NEON_CUH_ */
......@@ -17,13 +17,20 @@ limitations under the License. */
#include "hl_base.h"
#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__)
#include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType;
#else
typedef double2 vecType;
#endif
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#include <arm_neon.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef float32x4_t vecType;
#else
#error NEON instructions do not support double precision
#endif
#else
#include <mmintrin.h>
#include <xmmintrin.h>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_NEON_MATRIX_KERNEL_CUH_
#define HL_NEON_MATRIX_KERNEL_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
/* number of floats in a vector */
#define VECTOR_LEN 4
#define VECTOR_SET vdupq_n_f32
inline bool hl_check_align(size_t size) {
return !(size & (VECTOR_SIZE - 1));
}
inline bool hl_check_align(void *ptr) {
return hl_check_align(reinterpret_cast<size_t>(ptr));
}
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
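// Horizontally reduce the four lanes of mm with agg: first combine
// adjacent lanes, then the two 64-bit halves; lane 0 holds the result.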
float32x4_t rev = vrev64q_f32(mm);
float32x4_t tmp1 = agg.vecOp(mm, rev);
float32x2_t lo = vget_low_f32(tmp1);
float32x2_t hi = vget_high_f32(tmp1);
float32x4_t tmp2 = vcombine_f32(hi, lo);
float32x4_t ret = agg.vecOp(tmp1, tmp2);
return vgetq_lane_f32(ret, 0);
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst, int ld,
real *A, int lda) {
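// For each row of A: a vectorized agg/op pass over whole NEON vectors,
// then a scalar pass over the dimN % VECTOR_LEN remainder, merged into dst via sv.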
for (int i = 0; i < dimM; i++, A += lda) {
vecType mm = VECTOR_SET(agg.init());
vecType *a = (vecType*)(A);
for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
mm = agg.vecOp(mm, op.vecOp(*a));
}
int rem = dimN % VECTOR_LEN;
if (rem) {
real tmp = hl_agg_op(agg, mm);
real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
for (int j = 0; j < rem; j++) {
tmp = agg(tmp, op(a[j]));
}
dst[i*ld] = sv(dst[i*ld], tmp);
} else {
dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
}
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst, int ld,
real *A, int lda,
real *B, int ldb) {
for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
vecType mm = VECTOR_SET(agg.init());
vecType *a = (vecType*)(A);
vecType *b = (vecType*)(B);
for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
mm = agg.vecOp(mm, op.vecOp(*a, *b));
}
int rem = dimN % VECTOR_LEN;
if (rem) {
real tmp = hl_agg_op(agg, mm);
real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
for (int j = 0; j < rem; j++) {
tmp = agg(tmp, op(a[j], b[j]));
}
dst[i*ld] = sv(dst[i*ld], tmp);
} else {
dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
}
}
}
template <class Agg, class Op, class Saver>
void hl_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
for (int j = 0; j < dimN; j++) {
real tmp = agg.init();
for (int i = 0; i < dimM; i++) {
tmp = agg(tmp, op(A[i * lda + j]));
}
dst[j] = sv(dst[j], tmp);
}
}
template <class Agg, class Op, class Saver>
void hl_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
for (int j = 0; j < dimN; j++) {
real tmp = agg.init();
for (int i = 0; i < dimM; i++) {
tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
}
dst[j] = sv(dst[j], tmp);
}
}
/*
* MaxRow is greater than or equal to dimN
* dimN is a multiple of VECTOR_LEN
* so rem <= MaxRow / VECTOR_LEN
*/
template <int MaxRow, class Agg, class Op, class Saver>
void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
vecType mm[MaxRow / VECTOR_LEN];
for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
int rem = dimN % VECTOR_LEN;
if (rem) {
A += (dimN / VECTOR_LEN) * VECTOR_LEN;
dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
}
}
/*
* dimN is a multiple of VECTOR_LEN
* dimN is greater than Step
*/
template <int Step, class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
vecType mm[Step / VECTOR_LEN];
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
}
int remRow = dimN % Step;
if (remRow) {
hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda) {
if (dimN <= 16) {
hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
} else if (dimN <= 32) {
hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
} else if (dimN <= 1024 || dimM <= 512) {
hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
} else {
hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
}
}
template <int MaxRow, class Agg, class Op, class Saver>
void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
vecType mm[MaxRow / VECTOR_LEN];
for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
vecType *b = (vecType*)(B + i * ldb);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < dimN / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
int rem = dimN % VECTOR_LEN;
if (rem) {
A += (dimN / VECTOR_LEN) * VECTOR_LEN;
B += (dimN / VECTOR_LEN) * VECTOR_LEN;
dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
}
}
template <int Step, class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
vecType mm[Step / VECTOR_LEN];
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = VECTOR_SET(agg.init());
}
for (int i = 0; i < dimM; i++) {
vecType *a = (vecType*)(A + i * lda);
vecType *b = (vecType*)(B + i * ldb);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
}
}
vecType *result = (vecType*)(dst);
for (int n = 0; n < Step / VECTOR_LEN; n++) {
result[n] = sv.vecOp(result[n], mm[n]);
}
}
int remRow = dimN % Step;
if (remRow) {
hl_sse_column_op_with_rem<Step>(
agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
}
}
template <class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
int dimM, int dimN,
real *dst,
real *A, int lda,
real *B, int ldb) {
if (dimN <= 16) {
hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else if (dimN <= 32) {
hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else if (dimN <= 1024 || dimM <= 512) {
hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
} else {
hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
}
}
#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */
......@@ -159,4 +159,10 @@ extern void hl_sequence_avg_forward(real* dst,
int width,
const int mode);
extern void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode);
#endif /* HL_SEQUENCE_H_ */
......@@ -57,4 +57,10 @@ inline void hl_sequence_avg_forward(real* dst,
int width,
const int mode) {}
inline void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {}
#endif // HL_SEQUENCE_STUB_H_
......@@ -325,12 +325,12 @@ __global__ void KeSequenceAvgForward(real* dst,
int seqLength = end - start;
if (seqLength == 0) return;
real sum = 0.0;
for (int i = 0; i < seqLength; i++) {
sum += src[(start + i) * width + col];
for (int i = start; i < end; i++) {
sum += src[i * width + col];
}
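// mode 0: average; mode 1: plain sum; otherwise: sum scaled by 1/sqrt(seqLength).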
sum = mode == 1 ? sum :
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
dst[row * width + col] = sum;
dst[gid] = sum;
}
}
......@@ -354,3 +354,48 @@ void hl_sequence_avg_forward(real* dst,
(dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed");
}
__global__ void KeSequenceAvgBackward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int row = gid / width;
int col = gid % width;
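// One thread per (sequence, column) element: row indexes the sequence,
// col the feature column; mode has the same meaning as in the forward kernel.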
if (gid < height * width) {
int start = starts[row];
int end = starts[row + 1];
int seqLength = end - start;
if (seqLength == 0) return;
real grad = src[gid];
grad = mode == 1 ? grad :
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) {
dst[i * width + col] += grad;
}
}
}
void hl_sequence_avg_backward(real* dst,
real* src,
const int* starts,
int height,
int width,
const int mode) {
CHECK_NOTNULL(dst);
CHECK_NOTNULL(src);
CHECK_NOTNULL(starts);
int block = 512;
int grid = DIVUP(width * height, 512);
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed");
}
......@@ -16,66 +16,6 @@ limitations under the License. */
namespace paddle {
template <>
size_t FuncConfig::get<size_t>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.s;
}
template <>
real FuncConfig::get<real>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.r;
}
template <>
int FuncConfig::get<int>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.i;
}
template <>
bool FuncConfig::get<bool>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.b;
}
template <>
FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].s = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].r = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].i = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
<< key;
valueMap_[key].b = v;
return *this;
}
void BufferArgs::addArg(const Matrix& arg,
const TensorShape& shape,
ArgType argType) {
......
......@@ -18,32 +18,49 @@ limitations under the License. */
#include <vector>
#include "BufferArg.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Any.h"
#include "paddle/utils/ClassRegistrar.h"
#include "paddle/utils/Error.h"
namespace paddle {
/**
* Function Configuration.
* The argument type of Function::init.
* Follow-up will consider moving this data structure to Proto inside.
*/
class FuncConfig {
public:
union value {
size_t s;
real r;
int i;
bool b;
};
template <typename T>
T get(const std::string& key) const;
T get(const std::string& key, Error* err = nullptr) const {
try {
return any_cast<T>(valueMap_.at(key));
} catch (std::exception& e) { // could be cast or out of range exception.
if (err) {
*err = Error(e.what());
} else {
LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
}
return T();
}
}
template <typename T>
FuncConfig& set(const std::string& key, T v);
FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
auto it = valueMap_.find(key);
if (it != valueMap_.end()) { // already contains key.
if (err) {
*err = Error("Key %s is already set in FuncConfig", key.c_str());
} else {
LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
}
return *this;
}
valueMap_[key] = any(v);
return *this;
}
protected:
std::map<std::string, value> valueMap_;
mutable std::unordered_map<std::string, any> valueMap_;
};
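A brief usage sketch of the new any-based interface (assuming the FuncConfig and Error types from this commit; the vector-valued key mirrors the Pad changes below):

```cpp
#include <cstdint>
#include <vector>

// Assumes paddle::FuncConfig (above) and paddle::Error (paddle/utils/Error.h).
void configurePad(paddle::FuncConfig& conf) {
  conf.set<std::vector<uint32_t>>("channel", {2, 3});
  conf.set<std::vector<uint32_t>>("height", {1, 2});

  paddle::Error err;
  // Matching type: returns the stored vector; err stays unset.
  auto channel = conf.get<std::vector<uint32_t>>("channel", &err);
  // Mismatched type: any_cast throws internally and err captures the message.
  auto bad = conf.get<int>("channel", &err);
  (void)channel;
  (void)bad;
}
```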
/**
......
......@@ -25,9 +25,9 @@ void Pad<DEVICE_TYPE_CPU>(real* outputs,
const int inH,
const int inW,
const PadConf& pad) {
int cstart = pad.channelStart, cend = pad.channelEnd;
int hstart = pad.heightStart, hend = pad.heightEnd;
int wstart = pad.widthStart, wend = pad.widthEnd;
int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend;
int outH = inH + hstart + hend;
int outW = inW + wstart + wend;
......@@ -51,9 +51,9 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
const int inH,
const int inW,
const PadConf& pad) {
int cstart = pad.channelStart, cend = pad.channelEnd;
int hstart = pad.heightStart, hend = pad.heightEnd;
int wstart = pad.widthStart, wend = pad.widthEnd;
int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend;
int outH = inH + hstart + hend;
int outW = inW + wstart + wend;
......@@ -71,6 +71,12 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
}
}
static inline PadConf castToPadConf(const FuncConfig& conf) {
return {conf.get<std::vector<uint32_t>>("channel"),
conf.get<std::vector<uint32_t>>("height"),
conf.get<std::vector<uint32_t>>("width")};
}
/**
* \brief Padding zeros to the input according to the specified dimensions.
* The struct pad_ contains the padding size in each dimension.
......@@ -127,14 +133,7 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
template <DeviceType Device>
class PadFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
pad_.channelStart = config.get<int>("cstart");
pad_.channelEnd = config.get<int>("cend");
pad_.heightStart = config.get<int>("hstart");
pad_.heightEnd = config.get<int>("hend");
pad_.widthStart = config.get<int>("wstart");
pad_.widthEnd = config.get<int>("wend");
}
void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size());
......@@ -175,14 +174,7 @@ private:
template <DeviceType Device>
class PadGradFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
pad_.channelStart = config.get<int>("cstart");
pad_.channelEnd = config.get<int>("cend");
pad_.heightStart = config.get<int>("hstart");
pad_.heightEnd = config.get<int>("hend");
pad_.widthStart = config.get<int>("wstart");
pad_.widthEnd = config.get<int>("wend");
}
void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size());
......
......@@ -19,18 +19,12 @@ limitations under the License. */
namespace paddle {
struct PadConf {
/// how many values to add before the data along channel dimension.
int channelStart;
/// how many values to add after the data along channel dimension.
int channelEnd;
/// how many values to add before the data along height dimension.
int heightStart;
/// how many values to add after the data along height dimension.
int heightEnd;
/// how many values to add before the data along width dimension.
int widthStart;
/// how many values to add after the data along width dimension.
int widthEnd;
/// how many values to add before/after the data along channel dimension.
std::vector<uint32_t> channel;
/// how many values to add before/after the data along height dimension.
std::vector<uint32_t> height;
/// how many values to add before/after the data along width dimension.
std::vector<uint32_t> width;
};
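For reference, a sketch of filling the new PadConf directly; each vector holds the {before, after} padding for its dimension, matching castToPadConf earlier in this commit:

```cpp
#include <cstdint>
#include <vector>

// Assumes the paddle::PadConf definition above.
paddle::PadConf makePadConf() {
  paddle::PadConf pad;
  pad.channel = {2, 3};  // 2 values before, 3 after, along the channel axis
  pad.height = {1, 2};
  pad.width = {3, 2};
  return pad;
}
```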
/**
......
......@@ -44,9 +44,9 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
size_t nth = num * inC * inH * inW;
int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024;
int cstart = pad.channelStart, cend = pad.channelEnd;
int hstart = pad.heightStart, hend = pad.heightEnd;
int wstart = pad.widthStart, wend = pad.widthEnd;
int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend;
int outH = inH + hstart + hend;
int outW = inW + wstart + wend;
......@@ -83,9 +83,9 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
int nth = num * inC * inH * inW;
int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024;
int cstart = pad.channelStart, cend = pad.channelEnd;
int hstart = pad.heightStart, hend = pad.heightEnd;
int wstart = pad.widthStart, wend = pad.widthEnd;
int cstart = pad.channel[0], cend = pad.channel[1];
int hstart = pad.height[0], hend = pad.height[1];
int wstart = pad.width[0], wend = pad.width[1];
int outC = inC + cstart + cend;
int outH = inH + hstart + hend;
int outW = inW + wstart + wend;
......
......@@ -24,48 +24,22 @@ TEST(Pad, real) {
for (size_t imgSizeW : {5, 32, 96}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
FunctionCompare compare("Pad",
FuncConfig()
.set("cstart", 2)
.set("cend", 3)
.set("hstart", 1)
.set("hend", 2)
.set("wstart", 3)
.set("wend", 2));
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{
numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims));
compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO));
compare.run();
}
}
}
}
}
TEST(PadGrad, real) {
for (size_t numSamples : {5, 32}) {
for (size_t channels : {1, 5, 32}) {
for (size_t imgSizeH : {5, 33, 100}) {
for (size_t imgSizeW : {5, 32, 96}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
FunctionCompare compare("PadGrad",
FuncConfig()
.set("cstart", 2)
.set("cend", 3)
.set("hstart", 1)
.set("hend", 2)
.set("wstart", 3)
.set("wend", 2));
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{
numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims));
compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO));
compare.run();
for (bool test_grad : {false, true}) {
FunctionCompare compare(
test_grad ? "PadGrad" : "Pad",
FuncConfig()
.set<std::vector<uint32_t>>("channel", {2, 3})
.set<std::vector<uint32_t>>("height", {1, 2})
.set<std::vector<uint32_t>>("width", {3, 2}));
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{
numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
compare.addInputs(
BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
compare.addOutputs(BufferArg(
VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
compare.run();
}
}
}
}
......
......@@ -42,7 +42,8 @@ void AgentLayer::forward(PassType passType) {
// get Arguments from real layers
if (numSamples_ > 0 && numSamples_ < realHeight) {
if (realOutput.ids) {
output_.ids->subVecFrom(*realOutput.ids, 0, numSamples_);
output_.ids =
IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
} else {
output_.subArgFrom(
realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
......
......@@ -26,8 +26,6 @@ bool AverageLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
SequencePoolLayer::init(layerMap, parameterMap);
dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
// average strategy
if (config_.average_strategy() == "average") {
mode_ = kAverage;
......@@ -60,43 +58,9 @@ void AverageLayer::forward(PassType passType) {
void AverageLayer::backward(const UpdateCallback& callback) {
SequencePoolLayer::backward(callback);
const int* starts = startPositions_->getData(false);
MatrixPtr grad = getInputGrad(0);
if (grad) {
size_t dim = getSize();
real* gradientData = getInputGrad(0)->getData();
real* gradient = getOutputGrad()->getData();
size_t numSequences = startPositions_->getSize() - 1;
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
// TODO(Dangqingqing) optimization for GPU
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
if (0 == sequenceLength) {
// empty sequence
continue;
}
dataMtx_->setData(
gradientData + starts[sequenceId] * dim, sequenceLength, dim);
outMtx_->setData(gradient + sequenceId * dim);
switch (mode_) {
case kAverage: {
// plain average
dataMtx_->addBias(*outMtx_, 1.0f / sequenceLength);
break;
}
case kSum: {
// sum instead of average
dataMtx_->addBias(*outMtx_, 1.0f);
break;
}
case kAverageSquareRootN: {
// divide by square root of sequenceLength
dataMtx_->addBias(*outMtx_, 1.0f / sqrt(sequenceLength));
break;
}
default: { LOG(FATAL) << "should not reach here"; }
}
}
if (getInputGrad(0)) {
getInputGrad(0)->sequenceAvgBackward(
*getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
}
}
......
......@@ -45,8 +45,6 @@ public:
void backward(const UpdateCallback& callback = nullptr) override;
protected:
MatrixPtr outMtx_;
MatrixPtr dataMtx_;
int mode_;
};
} // namespace paddle
......@@ -107,6 +107,10 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
int channel = isDeconv_ ? numFilters_ : channels_[inIdx];
resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
CHECK_EQ(image->getWidth(),
static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
real *imgData = image->getData() + startIdx * image->getWidth();
MatrixPtr imageTmp =
Matrix::create(imgData,
......
......@@ -36,7 +36,7 @@ namespace paddle {
* | |- 5
* |
* |-*- 0
* |- 1
* |- 1
* @endcode
*
* where * indicates an internal node, and each leaf node represents a class.
......
......@@ -36,12 +36,9 @@ bool PadLayer::init(const LayerMap& layerMap,
CHECK_EQ(2, pad_conf.pad_c_size());
CHECK_EQ(2, pad_conf.pad_h_size());
CHECK_EQ(2, pad_conf.pad_w_size());
padc_.push_back(pad_conf.pad_c(0));
padc_.push_back(pad_conf.pad_c(1));
padh_.push_back(pad_conf.pad_h(0));
padh_.push_back(pad_conf.pad_h(1));
padw_.push_back(pad_conf.pad_w(0));
padw_.push_back(pad_conf.pad_w(1));
padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
outDims_ = TensorShape(4);
setOutDims(0);
......@@ -49,21 +46,15 @@ bool PadLayer::init(const LayerMap& layerMap,
createFunction(forward_,
"Pad",
FuncConfig()
.set("cstart", padc_[0])
.set("cend", padc_[1])
.set("hstart", padh_[0])
.set("hend", padh_[1])
.set("wstart", padw_[0])
.set("wend", padw_[1]));
.set("channel", padc_)
.set("height", padh_)
.set("width", padw_));
createFunction(backward_,
"PadGrad",
FuncConfig()
.set("cstart", padc_[0])
.set("cend", padc_[1])
.set("hstart", padh_[0])
.set("hend", padh_[1])
.set("wstart", padw_[0])
.set("wend", padw_[1]));
.set("channel", padc_)
.set("height", padh_)
.set("width", padw_));
return true;
}
......
......@@ -38,9 +38,9 @@ protected:
void setOutDims(const size_t batchSize);
void setTensorDim(const size_t batchSize);
std::vector<int> padc_;
std::vector<int> padh_;
std::vector<int> padw_;
std::vector<uint32_t> padc_;
std::vector<uint32_t> padh_;
std::vector<uint32_t> padw_;
TensorShape inDims_;
TensorShape outDims_;
};
......
......@@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf,
config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
config.layerConfig.set_bias_size(config.biasSize);
config.layerConfig.set_shared_biases(sharedBias);
config.inputDefs.push_back(
{inputType, "layer_0", conf.input_size(), parameterSize});
config.inputDefs.push_back({inputType,
"layer_0",
static_cast<size_t>(conf.input_size()),
parameterSize});
*config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
config.testState = testState;
testLayerGrad(config, "mixed", batchSize, false, useGpu);
......
......@@ -85,11 +85,16 @@ int getrf<float>(const CBLAS_ORDER order,
float* A,
const int lda,
int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return clapack_sgetrf(order, M, N, A, lda, ipiv);
#else
return LAPACKE_sgetrf(order, M, N, A, lda, ipiv);
#endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
}
template <>
......@@ -99,11 +104,16 @@ int getrf<double>(const CBLAS_ORDER order,
double* A,
const int lda,
int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return clapack_dgetrf(order, M, N, A, lda, ipiv);
#else
return LAPACKE_dgetrf(order, M, N, A, lda, ipiv);
#endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
}
template <>
......@@ -112,11 +122,16 @@ int getri<float>(const CBLAS_ORDER order,
float* A,
const int lda,
const int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return clapack_sgetri(order, N, A, lda, ipiv);
#else
return LAPACKE_sgetri(order, N, A, lda, ipiv);
#endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
}
template <>
......@@ -125,11 +140,16 @@ int getri<double>(const CBLAS_ORDER order,
double* A,
const int lda,
const int* ipiv) {
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return clapack_dgetri(order, N, A, lda, ipiv);
#else
return LAPACKE_dgetri(order, N, A, lda, ipiv);
#endif
#else
LOG(FATAL) << "Not implemented";
#endif
return 0;
}
template <>
......
......@@ -17,11 +17,14 @@ limitations under the License. */
#ifdef PADDLE_USE_MKL
#include <mkl.h>
#ifdef PADDLE_USE_LAPACK
#include <mkl_lapacke.h>
#endif
#else
extern "C" {
#include <cblas.h>
}
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
extern "C" {
#include <clapack.h>
......@@ -30,6 +33,7 @@ extern "C" {
#include <lapacke.h>
#endif
#endif
#endif
#include <cmath>
......
......@@ -483,6 +483,20 @@ void GpuMatrix::sequenceAvgForward(Matrix& a,
hl_sequence_avg_forward(dst, src, starts, height, width, mode);
}
void GpuMatrix::sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
size_t height = a.getHeight();
size_t width = getWidth();
CHECK_EQ(height, startsPos.getSize() - 1);
CHECK_EQ(width, a.getWidth());
real* dst = getData();
real* src = a.getData();
const int* starts = startsPos.getData();
hl_sequence_avg_backward(dst, src, starts, height, width, mode);
}
/* this = scaleAB*(a*b) + scaleT*this */
void GpuMatrix::mul(const GpuMatrix& a,
const GpuMatrix& b,
......@@ -2304,6 +2318,41 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
}
}
void CpuMatrix::sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
size_t height = a.getHeight();
size_t width = getWidth();
CHECK_EQ(height, startsPos.getSize() - 1);
CHECK_EQ(width, a.getWidth());
real* dst = getData();
real* src = a.getData();
const int* starts = startsPos.getData();
MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
for (size_t i = 0; i < height; ++i) {
int sequenceLength = starts[i + 1] - starts[i];
if (0 == sequenceLength) {
// empty sequence
continue;
}
outMtx->setData(dst + starts[i] * width, sequenceLength, width);
dataMtx->setData(src + i * width);
if (mode == 0) {
// plain average
outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
} else if (mode == 1) {
// sum instead of average
outMtx->addBias(*dataMtx, 1.0f);
} else if (mode == 2) {
// divide by square root of sequenceLength
outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
} else {
LOG(FATAL) << "should not reach here";
}
}
}
/* this = scaleAB*(a*b) + scaleT*this*/
void CpuMatrix::mul(const Matrix& a,
const Matrix& b,
......@@ -2377,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
int lda = a->getStride();
int ldb = b->getStride();
int ldc = getStride();
#ifndef PADDLE_TYPE_DOUBLE
cblas_sgemm(CblasRowMajor,
a_trans,
b_trans,
M,
N,
K,
scaleAB,
A,
lda,
B,
ldb,
scaleT,
C,
ldc);
#else
cblas_dgemm(CblasRowMajor,
a_trans,
b_trans,
M,
N,
K,
scaleAB,
A,
lda,
B,
ldb,
scaleT,
C,
ldc);
// TODO(yuyang18): Is gemm defined other place?
#endif
VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0]
<< " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1];
gemm<real>(
a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
}
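The `gemm<real>` call above replaces the removed `#ifdef PADDLE_TYPE_DOUBLE` branching. A hedged sketch of the dispatch idea (the helper actually added by this commit may differ in signature or location):

```cpp
#include <cblas.h>

// Sketch: resolve single- vs double-precision BLAS gemm at compile time so
// call sites such as CpuMatrix::mul need no precision #ifdefs.
template <class T>
void gemm(const CBLAS_TRANSPOSE aTrans, const CBLAS_TRANSPOSE bTrans,
          const int M, const int N, const int K, const T alpha, const T* A,
          const int lda, const T* B, const int ldb, const T beta, T* C,
          const int ldc);

template <>
void gemm<float>(const CBLAS_TRANSPOSE aTrans, const CBLAS_TRANSPOSE bTrans,
                 const int M, const int N, const int K, const float alpha,
                 const float* A, const int lda, const float* B, const int ldb,
                 const float beta, float* C, const int ldc) {
  cblas_sgemm(CblasRowMajor, aTrans, bTrans, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}

template <>
void gemm<double>(const CBLAS_TRANSPOSE aTrans, const CBLAS_TRANSPOSE bTrans,
                  const int M, const int N, const int K, const double alpha,
                  const double* A, const int lda, const double* B,
                  const int ldb, const double beta, double* C, const int ldc) {
  cblas_dgemm(CblasRowMajor, aTrans, bTrans, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}
```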
void CpuMatrix::mul(
......
......@@ -461,6 +461,12 @@ public:
LOG(FATAL) << "Not implemented";
}
virtual void sequenceAvgBackward(Matrix& a,
const IVector& startsPos,
int mode) {
LOG(FATAL) << "Not implemented";
}
/**
* @code
* this = scaleAB*(a*b) + scaleT*this
......@@ -1203,6 +1209,7 @@ public:
void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
/**
* @code
......@@ -1619,6 +1626,7 @@ public:
void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
/**
* @code
......
......@@ -13,119 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "SIMDFunctions.h"
#ifdef __SSE3__
#include <immintrin.h>
#endif
#include <algorithm>
#ifndef __AVX__
static void addto_sse(float* a, const float* b, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
mb0 = _mm_load_ps(b);
mb1 = _mm_load_ps(b + 4);
mb2 = _mm_load_ps(b + 8);
mb3 = _mm_load_ps(b + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) a[i] += b[i];
}
static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
for (int i = 0; i < batch; i++) {
mb0 = _mm_load_ps(b[i]);
mb1 = _mm_load_ps(b[i] + 4);
mb2 = _mm_load_ps(b[i] + 8);
mb3 = _mm_load_ps(b[i] + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
b[i] += 16;
}
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) {
for (int k = 0; k < batch; k++) a[i] += b[k][i];
}
return;
}
static void col_max_sse(float* result,
const float* data,
int dim,
int numSamples) {
// first sample, direct copy
for (int d = 0; d < dim; ++d) {
result[d] = data[d];
}
int offset = dim % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
// first 16n dims
for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
ma0 = _mm_load_ps(result);
ma1 = _mm_load_ps(result + 4);
ma2 = _mm_load_ps(result + 8);
ma3 = _mm_load_ps(result + 12);
for (int i = 1; i < numSamples; i++) {
mb0 = _mm_load_ps(data + i * dim);
mb1 = _mm_load_ps(data + i * dim + 4);
mb2 = _mm_load_ps(data + i * dim + 8);
mb3 = _mm_load_ps(data + i * dim + 12);
ma0 = _mm_max_ps(ma0, mb0);
ma1 = _mm_max_ps(ma1, mb1);
ma2 = _mm_max_ps(ma2, mb2);
ma3 = _mm_max_ps(ma3, mb3);
}
_mm_store_ps(result, ma0);
_mm_store_ps(result + 4, ma1);
_mm_store_ps(result + 8, ma2);
_mm_store_ps(result + 12, ma3);
}
// last dims
for (int d = 0; d < offset; ++d) {
float sm = data[d];
for (int i = 1; i < numSamples; ++i) {
sm = std::max(sm, data[i * dim + d]);
}
result[d] = sm;
}
}
#else
#ifdef __AVX__
static void addto_avx(float* a, const float* b, size_t len) {
int offset = len % 32;
......@@ -355,17 +248,128 @@ static void decayL1_avx(
}
}
#elif defined(__SSE3__)
static void addto_sse(float* a, const float* b, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
mb0 = _mm_load_ps(b);
mb1 = _mm_load_ps(b + 4);
mb2 = _mm_load_ps(b + 8);
mb3 = _mm_load_ps(b + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) a[i] += b[i];
}
static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
int offset = len % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
for (unsigned int k = 0; k < len / 16; k++, a += 16) {
ma0 = _mm_load_ps(a);
ma1 = _mm_load_ps(a + 4);
ma2 = _mm_load_ps(a + 8);
ma3 = _mm_load_ps(a + 12);
for (int i = 0; i < batch; i++) {
mb0 = _mm_load_ps(b[i]);
mb1 = _mm_load_ps(b[i] + 4);
mb2 = _mm_load_ps(b[i] + 8);
mb3 = _mm_load_ps(b[i] + 12);
ma0 = _mm_add_ps(ma0, mb0);
ma1 = _mm_add_ps(ma1, mb1);
ma2 = _mm_add_ps(ma2, mb2);
ma3 = _mm_add_ps(ma3, mb3);
b[i] += 16;
}
_mm_store_ps(a, ma0);
_mm_store_ps(a + 4, ma1);
_mm_store_ps(a + 8, ma2);
_mm_store_ps(a + 12, ma3);
}
for (int i = 0; i < offset; i++) {
for (int k = 0; k < batch; k++) a[i] += b[k][i];
}
return;
}
static void col_max_sse(float* result,
const float* data,
int dim,
int numSamples) {
// first sample, direct copy
for (int d = 0; d < dim; ++d) {
result[d] = data[d];
}
int offset = dim % 16;
__m128 ma0, ma1, ma2, ma3;
__m128 mb0, mb1, mb2, mb3;
// first 16n dims
for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
ma0 = _mm_load_ps(result);
ma1 = _mm_load_ps(result + 4);
ma2 = _mm_load_ps(result + 8);
ma3 = _mm_load_ps(result + 12);
for (int i = 1; i < numSamples; i++) {
mb0 = _mm_load_ps(data + i * dim);
mb1 = _mm_load_ps(data + i * dim + 4);
mb2 = _mm_load_ps(data + i * dim + 8);
mb3 = _mm_load_ps(data + i * dim + 12);
ma0 = _mm_max_ps(ma0, mb0);
ma1 = _mm_max_ps(ma1, mb1);
ma2 = _mm_max_ps(ma2, mb2);
ma3 = _mm_max_ps(ma3, mb3);
}
_mm_store_ps(result, ma0);
_mm_store_ps(result + 4, ma1);
_mm_store_ps(result + 8, ma2);
_mm_store_ps(result + 12, ma3);
}
// last dims
for (int d = 0; d < offset; ++d) {
float sm = data[d];
for (int i = 1; i < numSamples; ++i) {
sm = std::max(sm, data[i * dim + d]);
}
result[d] = sm;
}
}
#endif
#ifndef __AVX__
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#else
#if defined(__AVX__)
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#elif defined(__SSE3__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#endif
namespace paddle {
namespace simd {
namespace internal {
#ifdef __SSE3__
void addToImpl(float* a, const float* b, size_t len) {
SIMD_INVOKE(addto, a, b, len);
}
......@@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
void colMaxImpl(float* result, const float* data, int dim, int numSamples) {
SIMD_INVOKE(col_max, result, data, dim, numSamples);
}
#endif
#ifdef __AVX__
void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) {
......@@ -385,8 +390,8 @@ void decayL1AvxImpl(
float* dst, float* src, float* lr, float lambda, size_t len) {
decayL1_avx(dst, src, lr, lambda, len);
}
#endif
} // namespace internal
} // namespace simd
} // namespace paddle
......@@ -128,17 +128,29 @@ void decayL1AvxImpl(
template <>
inline void addTo(float* a, const float* b, size_t len) {
#ifdef __SSE3__
internal::addToImpl(a, b, len);
#else
naive::addTo(a, b, len);
#endif
}
template <>
inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
#ifdef __SSE3__
internal::batchAddToImpl(a, b, batch, len);
#else
naive::batchAddTo(a, b, batch, len);
#endif
}
template <>
inline void colMax(float* result, const float* data, int dim, int numSamples) {
#ifdef __SSE3__
internal::colMaxImpl(result, data, dim, numSamples);
#else
naive::colMax(result, data, dim, numSamples);
#endif
}
template <>
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "Storage.h"
#include "Allocator.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
DEFINE_int32(pool_limit_size,
......@@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
}
if (gpuAllocator_[deviceId] == nullptr) {
std::string name =
"gpu" + std::to_string(deviceId) + std::string("_pool");
"gpu" + str::to_string(deviceId) + std::string("_pool");
gpuAllocator_[deviceId] =
new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name);
}
......
......@@ -685,7 +685,7 @@ TEST(SMatrix, topK) {
}
}
void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInput->randomizeUniform();
......@@ -706,15 +706,25 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
TensorCheckErr(*cpuOutput, *gpuOutput);
MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInGrad->randomizeUniform();
gpuInGrad->copyFrom(*cpuInGrad);
cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
TensorCheckErr(*cpuInGrad, *gpuInGrad);
}
TEST(Matrix, sequenceAvgForward) {
TEST(Matrix, sequenceAvg) {
for (auto batchSize : {10, 128, 6000}) {
for (auto inputDim : {32, 100, 512}) {
for (auto mode : {0, 1, 2}) {
VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
<< " mode=" << mode;
testMatrixSequenceAvgForward(batchSize, inputDim, mode);
testMatrixSequenceAvg(batchSize, inputDim, mode);
}
}
}
......
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/utils/Flags.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h"
DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
DEFINE_double(async_lagged_ratio_min,
......@@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
callback(response);
/// always defined; the barrier slowest-node function needs it.
statSet_.reset(new StatSet("ParameterServer" + std::to_string(serverId_)));
statSet_.reset(new StatSet("ParameterServer" +
str::to_string(static_cast<int>(serverId_))));
}
real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
......
......@@ -160,10 +160,19 @@ class SparseFloatScanner(SparseBinaryScanner):
class IndexScanner(IScanner):
def __init__(self, input_type, pos):
IScanner.__init__(self, input_type, pos)
self.__ids__ = []
self.__ids__ = None
self.__idx__ = 0
def pre_scan(self, dat):
self.__idx__ += 1
def finish_pre_scan(self, argument):
self.__ids__ = [0] * self.__idx__
self.__idx__ = 0
def scan(self, dat):
self.__ids__.append(dat)
self.__ids__[self.__idx__] = dat
self.__idx__ += 1
def finish_scan(self, argument):
ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
......
......@@ -94,7 +94,7 @@ docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com
Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
```bash
docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev
docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev
```
This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, can build the source code with any local changes. When it writes to `/paddle/build` in the container, it actually writes to `$PWD/build` on the host.
......@@ -110,7 +110,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
- `WITH_AVX`: ***Required***. Setting it to "OFF" prevents generating AVX instructions. If you don't know what AVX is, you might want to set it to "ON".
- `WITH_TEST`: ***Optional, default OFF***. Build unit test binaries. Once you've built the unit tests, you can run them manually with the following command:
```bash
docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall"
docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall"
```
- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run the unit tests without building them.
......@@ -129,7 +129,7 @@ This production image is minimal -- it includes binary `paddle`, the shared libr
Again, the development happens on the host. Suppose we have a simple application program in `a.py`; we can test and run it using the production image:
```bash
docker run -it -v $PWD:/work paddle /work/a.py
docker run --rm -it -v $PWD:/work paddle /work/a.py
```
But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image with the extra dependencies installed.
......@@ -166,3 +166,18 @@ docker tag myapp me/myapp
docker push
kubectl ...
```
### Reading source code with woboq codebrowser
For developers who are interested in the C++ source code, please use `-e "WOBOQ=ON"` to build the C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
- The following command builds PaddlePaddle, generates HTML pages from the C++ source code, and writes them into `$HOME/woboq_out` on the host:
```bash
docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddle:dev
```
- You can open the generated HTML files in your Web browser. Or, if you want to run an Nginx container to serve them for a wider audience, you can run:
```bash
docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
```
......@@ -46,7 +46,7 @@ make install
# install them in docker
cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" ..
if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then
if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev
# Install woboq_codebrowser.
git clone https://github.com/woboq/woboq_codebrowser /woboq
......@@ -56,7 +56,7 @@ if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then
.
make
export WOBOQ_OUT=/usr/share/nginx/html/paddle
export WOBOQ_OUT=/woboq_out/paddle
export BUILD_DIR=/paddle/build
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if __cplusplus > 201402L
#include <any>
namespace paddle {
// use std::any for C++17
using std::any;
using std::any_cast;
using std::bad_any_cast;
} // namespace paddle
#else
#include <any.hpp>
namespace paddle {
// use linb::any for C++11
using linb::any;
using linb::any_cast;
using linb::bad_any_cast;
} // namespace paddle
#endif
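A brief usage sketch for the alias above: `paddle::any` behaves the same whether it resolves to `std::any` (C++17) or to `linb::any` from the bundled fallback library. The include path is an assumption; use whatever path this header gets in the tree:

```cpp
#include <string>
// Assumed location of the header above.
#include "paddle/utils/Any.h"

void anyExample() {
  paddle::any v = std::string("padding");
  // Correct type: the cast succeeds.
  std::string s = paddle::any_cast<std::string>(v);
  try {
    paddle::any_cast<int>(v);  // wrong type: throws
  } catch (const paddle::bad_any_cast&) {
    // handle the type mismatch
  }
  (void)s;
}
```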
......@@ -19,7 +19,7 @@ limitations under the License. */
/// for MSVC
#define CPUID(info, x) __cpuidex(info, x, 0)
#else
#elif !defined(__ANDROID__)
#include <cpuid.h>
......@@ -31,6 +31,7 @@ limitations under the License. */
namespace paddle {
SIMDFlags::SIMDFlags() {
#if !defined(__ANDROID__)
unsigned int cpuInfo[4];
// CPUID: https://en.wikipedia.org/wiki/CPUID
// clang-format off
......@@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() {
CPUID(cpuInfo, 0x80000001);
simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4 : SIMD_NONE;
// clang-format on
#else
simd_flags_ = SIMD_NEON;
#endif
}
SIMDFlags const* SIMDFlags::instance() {
......
......@@ -30,6 +30,7 @@ enum simd_t {
SIMD_AVX = 1 << 8, ///< AVX
SIMD_AVX2 = 1 << 9, ///< AVX 2
SIMD_AVX512 = 1 << 10, ///< AVX 512
SIMD_NEON = 1 << 11, ///< NEON
};
// clang-format on
......@@ -96,6 +97,7 @@ private:
#define HAS_AVX HAS_SIMD(SIMD_AVX)
#define HAS_AVX2 HAS_SIMD(SIMD_AVX2)
#define HAS_AVX512 HAS_SIMD(SIMD_AVX512)
#define HAS_NEON HAS_SIMD(SIMD_NEON)
// clang-format on
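A hedged sketch of how these flags are typically consumed at runtime; the kernel names are hypothetical and the include path is assumed:

```cpp
#include <cstddef>
#include "paddle/utils/CpuId.h"  // assumed header for HAS_AVX / HAS_NEON

// Pick an implementation from the runtime-detected SIMD flags. On Android
// builds SIMDFlags unconditionally reports NEON, as the change above shows.
void addToDispatch(float* a, const float* b, size_t len) {
  if (HAS_AVX) {
    // addto_avx(a, b, len);   // hypothetical AVX kernel
  } else if (HAS_NEON) {
    // addto_neon(a, b, len);  // hypothetical NEON kernel for ARM builds
  } else {
    for (size_t i = 0; i < len; ++i) a[i] += b[i];  // scalar fallback
  }
}
```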
/**
......
......@@ -18,6 +18,7 @@ limitations under the License. */
*/
#include "Logging.h"
#include <cstdlib>
namespace paddle {
......
......@@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) {
return v;
}
/**
* Cast type T to string with status.
*
* @param [in] v input value of type T.
* @param [out] ok status; set to true if the cast succeeds. Pass nullptr
* if the caller does not care about errors.
* @return result of the cast. If an error occurred, an empty string is
* returned.
*/
template <class T>
inline std::string toWithStatus(const T v, bool* ok = nullptr) {
std::ostringstream sout;
sout << v;
if (ok) {
*ok = !sout.fail();
}
return sout.str();
}
/// Convert string to type T. It makes sure all the characters in s are used.
/// Otherwise it will abort.
///
......@@ -67,6 +86,18 @@ inline T to(const std::string& s) {
return v;
}
/// Convert type T to string.
///
/// @tparam T type of input value
/// @param v input value of type T
template <class T>
std::string to_string(T v) {
bool ok;
std::string s = toWithStatus<T>(v, &ok);
CHECK(ok) << "Cannot convert v(" << v << ") to type std::string";
return s;
}
} // namespace str
#undef DEFINE_STRING_CONVERSION
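A short usage sketch for the helpers above; the `paddle::str` qualification matches the call sites elsewhere in this commit:

```cpp
#include <string>
#include "paddle/utils/StringUtil.h"

void stringUtilExample() {
  bool ok = false;
  // Non-aborting variant: ok reports whether the conversion succeeded.
  std::string a = paddle::str::toWithStatus<int>(42, &ok);  // a == "42"
  // Aborting variant: CHECK-fails if the stream conversion fails.
  std::string b = paddle::str::to_string(3.14);
  (void)a;
  (void)b;
}
```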
......
......@@ -15,11 +15,16 @@ limitations under the License. */
#include "Util.h"
#include <dirent.h>
#include <pmmintrin.h>
#include <signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#include <fstream>
#include <mutex>
......@@ -163,8 +168,12 @@ void initMain(int argc, char** argv) {
installProfilerSwitch();
#ifdef __SSE__
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif
#ifdef __SSE3__
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
if (FLAGS_seed == 0) {
unsigned int t = time(NULL);
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/utils/Locks.h"
#include <semaphore.h>
#include <unistd.h>
#include "paddle/utils/Logging.h"
namespace paddle {
class SemaphorePrivate {
......@@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
sem_init(&m->sem, 0, initValue);
}
Semaphore::~Semaphore() { sem_destroy(&m->sem); }
Semaphore::~Semaphore() {
sem_destroy(&m->sem);
delete m;
}
bool Semaphore::timeWait(struct timespec* ts) {
return (0 == sem_timedwait(&m->sem, ts));
......@@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); }
void Semaphore::post() { sem_post(&m->sem); }
#ifdef PADDLE_USE_PTHREAD_SPINLOCK
class SpinLockPrivate {
public:
inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
inline void lock() { pthread_spin_lock(&lock_); }
inline void unlock() { pthread_spin_unlock(&lock_); }
pthread_spinlock_t lock_;
char padding_[64 - sizeof(pthread_spinlock_t)];
};
SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
#else
SpinLock::~SpinLock() { delete m; }
#include <atomic>
class SpinLockPrivate {
public:
inline void lock() {
while (lock_.test_and_set(std::memory_order_acquire)) {
}
}
inline void unlock() { lock_.clear(std::memory_order_release); }
std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
char padding_[64 - sizeof(lock_)]; // Padding to cache line size
};
void SpinLock::lock() { pthread_spin_lock(&m->lock_); }
#endif
void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); }
SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
SpinLock::~SpinLock() { delete m; }
void SpinLock::lock() { m->lock(); }
void SpinLock::unlock() { m->unlock(); }
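A minimal usage sketch of the lock, valid for both the pthread spinlock backend and the new `std::atomic_flag` fallback:

```cpp
#include <thread>
#include "paddle/utils/Locks.h"

void spinLockExample() {
  paddle::SpinLock lock;
  int counter = 0;
  auto work = [&] {
    for (int i = 0; i < 1000; ++i) {
      lock.lock();
      ++counter;  // keep the critical section short: waiters busy-spin
      lock.unlock();
    }
  };
  std::thread t1(work), t2(work);
  t1.join();
  t2.join();
  // counter == 2000
}
```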
#ifdef PADDLE_USE_PTHREAD_BARRIER
class ThreadBarrierPrivate {
public:
pthread_barrier_t barrier_;
inline explicit ThreadBarrierPrivate(int count) {
pthread_barrier_init(&barrier_, nullptr, count);
}
inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); }
inline void wait() { pthread_barrier_wait(&barrier_); }
};
ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) {
pthread_barrier_init(&m->barrier_, nullptr, count);
}
#else
ThreadBarrier::~ThreadBarrier() {
pthread_barrier_destroy(&m->barrier_);
delete m;
}
class ThreadBarrierPrivate {
public:
pthread_mutex_t mutex_;
pthread_cond_t cond_;
int count_;
int tripCount_;
inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
CHECK_NE(cnt, 0);
CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
CHECK_GE(pthread_cond_init(&cond_, 0), 0);
}
inline ~ThreadBarrierPrivate() {
pthread_cond_destroy(&cond_);
pthread_mutex_destroy(&mutex_);
}
/**
* @brief Wait at the barrier.
* @return true for the caller that trips the barrier (i.e. the last waiter)
*/
inline bool wait() {
pthread_mutex_lock(&mutex_);
++count_;
if (count_ >= tripCount_) {
count_ = 0;
pthread_cond_broadcast(&cond_);
pthread_mutex_unlock(&mutex_);
return true;
} else {
pthread_cond_wait(&cond_, &mutex_);
pthread_mutex_unlock(&mutex_);
return false;
}
}
};
#endif
void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); }
ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
ThreadBarrier::~ThreadBarrier() { delete m; }
void ThreadBarrier::wait() { m->wait(); }
} // namespace paddle
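A small sketch of the barrier semantics, identical for the native `pthread_barrier` backend and the mutex/condvar fallback introduced above:

```cpp
#include <thread>
#include "paddle/utils/Locks.h"

void barrierExample() {
  paddle::ThreadBarrier barrier(2);  // rendezvous point for two threads
  std::thread worker([&] {
    // ... phase 1 work ...
    barrier.wait();  // blocks until both threads have arrived
    // ... phase 2 work ...
  });
  barrier.wait();  // the main thread arrives; both proceed together
  worker.join();
}
```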
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/Locks.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
DEFINE_int32(test_thread_num, 10, "testing thread number");
......@@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) {
while (countDown-- > 0) {
start.wait();
for (size_t i = 0; i < layerSize; ++i) {
tracer.push("layer_" + std::to_string(i));
tracer.push("layer_" + paddle::str::to_string(i));
}
tracer.pop("");
for (size_t i = 0; i < layerSize; ++i) {
tracer.pop("layer_" + std::to_string(layerSize - 1 - i));
tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
}
finish.wait();
}
......@@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) {
while (countDown-- > 0) {
start.wait();
for (size_t i = 0; i < layerSize; ++i) {
tracer.push("layer_" + std::to_string(i));
tracer.push("layer_" + paddle::str::to_string(i));
}
tracer.clear(); // in forward test, tracer will clear after forward.
finish.wait();
......
......@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
for (size_t i = 0; i < 1000; ++i) {
paddle::gLayerStackTrace.push("layer_" + std::to_string(i));
paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i));
if (i == 998) {
throw "Unhandle exception";
}
......
......@@ -18,7 +18,8 @@ limitations under the License. */
using namespace paddle; // NOLINT
TEST(SIMDFlags, gccTest) {
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__))
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
!defined(__arm__)
// clang-format off
CHECK(!__builtin_cpu_supports("sse") != HAS_SSE);
CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2);
......@@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) {
LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX;
LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2;
LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512;
LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON;
}
......@@ -24,8 +24,9 @@ add_custom_target(paddle_python ALL DEPENDS
${OUTPUT_DIR}/.timestamp)
add_subdirectory(paddle/trainer_config_helpers/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests)
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
DESTINATION opt/paddle/share/wheels
......
......@@ -18,7 +18,7 @@ import inspect
from paddle.trainer.config_parser import *
from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
ReluActivation, IdentityActivation, SoftmaxActivation
ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
from .evaluators import *
from .poolings import MaxPooling, AvgPooling, BasePoolingType
from .attrs import *
......@@ -1916,7 +1916,7 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
@layer_support()
def hsigmoid(input,
label,
num_classes,
num_classes=None,
name=None,
bias_attr=None,
param_attr=None,
......@@ -1932,8 +1932,7 @@ def hsigmoid(input,
.. code-block:: python
cost = hsigmoid(input=[layer1, layer2],
label=data_layer,
num_classes=3)
label=data_layer)
:param input: Input layers. It could be a LayerOutput or list/tuple of
LayerOutput.
......@@ -1941,12 +1940,14 @@ def hsigmoid(input,
:param label: Label layer.
:type label: LayerOutput
:param num_classes: number of classes.
:type num_classes: int
:type num_classes: int|None
:param name: layer name
:type name: basestring
:param bias_attr: Bias attribute. None means default bias.
False means no bias.
:type bias_attr: ParameterAttribute|False
:param param_attr: Parameter Attribute. None means default parameter.
:type param_attr: ParameterAttribute|None
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
......@@ -1966,6 +1967,11 @@ def hsigmoid(input,
assert isinstance(label, LayerOutput)
assert label.layer_type == LayerType.DATA
if num_classes is None:
num_classes = label.size
if num_classes is None or num_classes <= 2:
raise ValueError("hsigmoid label size must larger than 2.")
ipts_for_layer = []
parents = []
for each_input, each_param_attr in zip(input, param_attr):
......@@ -2253,8 +2259,9 @@ def img_pool_layer(input,
pool_type.name = 'avg'
type_name = pool_type.name + '-projection' \
if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
else pool_type.name
if (
isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
else pool_type.name
pool_size_y = pool_size if pool_size_y is None else pool_size_y
stride_y = stride if stride_y is None else stride_y
......@@ -3294,8 +3301,8 @@ def recurrent_group(step,
assert (targetInlink == None or targetInlink_in_inlinks())
targetInlinkName = None if targetInlink == None \
else targetInlink.name if isinstance(targetInlink, LayerOutput) \
else targetInlink.input.name
else targetInlink.name if isinstance(targetInlink, LayerOutput) \
else targetInlink.input.name
contains_sub_seq = [False]
......@@ -4807,12 +4814,14 @@ def crf_decoding_layer(input,
return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
@wrap_act_default(act=SigmoidActivation())
@wrap_bias_attr_default(has_bias=True)
@wrap_name_default()
@layer_support()
def nce_layer(input,
label,
num_classes,
act=None,
weight=None,
num_neg_samples=10,
neg_distribution=None,
......@@ -4841,6 +4850,8 @@ def nce_layer(input,
:type weight: LayerOutput
:param num_classes: number of classes.
:type num_classes: int
:param act: Activation, default is Sigmoid.
:type act: BaseActivation
:param num_neg_samples: number of negative samples. Default is 10.
:type num_neg_samples: int
:param neg_distribution: The distribution for generating the random negative labels.
......@@ -4863,6 +4874,8 @@ def nce_layer(input,
assert isinstance(neg_distribution, collections.Sequence)
assert len(neg_distribution) == num_classes
assert sum(neg_distribution) == 1
if not isinstance(act, BaseActivation):
raise TypeError()
ipts_for_layer = []
parents = []
......@@ -4884,12 +4897,17 @@ def nce_layer(input,
type=LayerType.NCE_LAYER,
num_classes=num_classes,
neg_sampling_dist=neg_distribution,
active_type=act.name,
num_neg_samples=num_neg_samples,
inputs=ipts_for_layer,
bias=ParamAttr.to_bias(bias_attr),
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name, LayerType.NCE_LAYER, parents=parents, size=l.config.size)
name,
LayerType.NCE_LAYER,
parents=parents,
size=l.config.size,
activation=act)
"""
......
......@@ -21,19 +21,22 @@ import data_type
import topology
import data_feeder
import networks
import evaluator
from . import dataset
from . import reader
from . import plot
import attr
import pooling
import inference
import networks
import py_paddle.swig_paddle as api
import minibatch
import plot
__all__ = [
'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
'topology', 'networks', 'infer'
'topology', 'networks', 'infer', 'plot', 'evaluator'
]
......
......@@ -65,13 +65,42 @@ class Layer(object):
def __init__(self, name=None, parent_layers=None):
assert isinstance(parent_layers, dict)
self.name = name
self.__contex__ = {}
self.__context__ = {}
self.__parent_layers__ = parent_layers
# some layer may have some extra parent layer
self.__extra_parent__ = []
# used for evaluator.
self.__children_layers__ = []
def extra_parent(self):
return self.__extra_parent__
def append_extra_parent(self, parent):
self.__extra_parent__.append(parent)
def append_child(self, layer, parent_names):
self.__children_layers__.append((layer, parent_names))
def to_proto(self, context):
"""
Function to set the proto attribute.
"""
self.__context__ = context
# STEP: shortcut if this layer was parsed before.
if self.context_name() in context:
if self.use_context_name():
return context[self.context_name()]
else:
return context[self.name]
# STEP: parse extra_parent that is not used by this layer but must
# be parsed before this layer.
for p in self.__extra_parent__:
p.to_proto(context=context)
# STEP: parse the parents that are used by this layer, collect the results
# and insert them into the kwargs of this layer's to_proto_impl method.
kwargs = dict()
for layer_name in self.__parent_layers__:
if not isinstance(self.__parent_layers__[layer_name],
......@@ -83,12 +112,29 @@ class Layer(object):
self.__parent_layers__[layer_name])
kwargs[layer_name] = v1_layer
# STEP: parse myself and add myself into context.
ret_val = self.to_proto_impl(**kwargs)
if self.context_name() is not None \
and self.context_name() not in context:
context[self.context_name()] = ret_val
# STEP: parse children that should be parsed after this layer.
for layer, pnames in self.__children_layers__:
drop = False
# child will only be parsed if all parents are in context.
for pname in pnames:
if pname not in context:
drop = True
break
if drop:
continue
layer.to_proto(context=context)
# STEP: return v1 layer result
if self.context_name() is None:
return self.to_proto_impl(**kwargs)
elif self.context_name() not in context:
context[self.context_name()] = self.to_proto_impl(**kwargs)
self.__contex__ = context
if self.use_context_name():
return ret_val
elif self.use_context_name():
return context[self.context_name()]
else:
return context[self.name]
......@@ -113,10 +159,13 @@ class Layer(object):
this layer is called.
:return:
"""
return self.__contex__[self.context_name()].size
return self.__context__[self.context_name()].size
def __convert_to_v2__(method_name, parent_names, is_default_name=True):
def __convert_to_v2__(method_name,
parent_names,
is_default_name=True,
attach_parent=False):
if is_default_name:
wrapper = wrap_name_default(name_prefix=method_name)
else:
......@@ -129,9 +178,20 @@ def __convert_to_v2__(method_name, parent_names, is_default_name=True):
parent_layers = dict()
other_kwargs = dict()
for pname in parent_names:
if kwargs.has_key(pname):
if pname in kwargs:
parent_layers[pname] = kwargs[pname]
if attach_parent:
pnames = [x.context_name() for x in parent_layers.values()]
for pname in parent_layers:
layers = kwargs[pname]
if not isinstance(layers, collections.Sequence):
layers = [layers]
for layer in layers:
layer.append_child(self, pnames)
for key in kwargs.keys():
if key not in parent_names:
other_kwargs[key] = kwargs[key]
......
......@@ -66,13 +66,6 @@ def download(url, module_name, md5sum):
return filename
def dict_add(a_dict, ele):
if ele in a_dict:
a_dict[ele] += 1
else:
a_dict[ele] = 1
def fetch_all():
for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.v2.dataset)):
......
......@@ -22,6 +22,7 @@ into paddle reader creators.
"""
import paddle.v2.dataset.common
import collections
import tarfile
import Queue
import re
......@@ -58,10 +59,10 @@ def build_dict(pattern, cutoff):
"""
Build a word dictionary, the key is word, and the value is index.
"""
word_freq = {}
word_freq = collections.defaultdict(int)
for doc in tokenize(pattern):
for word in doc:
paddle.v2.dataset.common.dict_add(word_freq, word)
word_freq[word] += 1
# Not sure if we should prune less-frequent words here.
word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
......
......@@ -18,6 +18,7 @@ This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/
parse train/test set into paddle reader creators.
"""
import paddle.v2.dataset.common
import collections
import tarfile
__all__ = ['train', 'test', 'build_dict']
......@@ -27,15 +28,14 @@ MD5 = '30177ea32e27c525793142b6bf2c8e2d'
def word_count(f, word_freq=None):
add = paddle.v2.dataset.common.dict_add
if word_freq == None:
word_freq = {}
if word_freq is None:
word_freq = collections.defaultdict(int)
for l in f:
for w in l.strip().split():
add(word_freq, w)
add(word_freq, '<s>')
add(word_freq, '<e>')
word_freq[w] += 1
word_freq['<s>'] += 1
word_freq['<e>'] += 1
return word_freq
......
......@@ -20,8 +20,10 @@ parse train/test set into paddle reader creators.
"""
import tarfile
import gzip
from paddle.v2.dataset.common import download
from paddle.v2.parameters import Parameters
__all__ = ['train', 'test', 'build_dict']
......@@ -30,6 +32,9 @@ MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
# this is a small dataset for testing. The original data is too large and will be added later.
URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6'
# this is the pretrained model, whose BLEU = 26.92
URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
MD5_MODEL = '6b097d23e15654608c6f74923e975535'
START = "<s>"
END = "<e>"
......@@ -126,5 +131,13 @@ def test(dict_size):
download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
def model():
tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL)
with gzip.open(tar_file, 'r') as f:
parameters = Parameters.from_tar(f)
return parameters
def fetch():
download(URL_TRAIN, 'wmt14', MD5_TRAIN)
download(URL_MODEL, 'wmt14', MD5_MODEL)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.trainer_config_helpers.evaluators as evs
import inspect
from config_base import __convert_to_v2__
__all__ = []
def initialize():
def convert_to_new_name(nm):
return nm[:-len("_evaluator")]
for __ev_name__ in filter(lambda x: x.endswith('_evaluator'), evs.__all__):
__ev__ = getattr(evs, __ev_name__)
if hasattr(__ev__, 'argspec'):
argspec = __ev__.argspec
else:
argspec = inspect.getargspec(__ev__)
parent_names = filter(lambda x: x in ['input', 'label', 'weight'],
argspec.args)
v2_ev = __convert_to_v2__(
__ev_name__,
parent_names=parent_names,
is_default_name='name' in argspec.args,
attach_parent=True)
__new_name__ = convert_to_new_name(__ev_name__)
globals()[__new_name__] = v2_ev
globals()[__new_name__].__name__ = __new_name__
__all__.append(__new_name__)
initialize()
......@@ -33,40 +33,52 @@ The primary usage shows below.
import collections
import inspect
from config_base import Layer, __convert_to_v2__
import re
import paddle.trainer_config_helpers as conf_helps
from paddle.trainer.config_parser import \
RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
RecurrentLayerGroupEnd, model_type
from paddle.trainer_config_helpers.config_parser_utils import \
parse_network_config as __parse__
from paddle.trainer_config_helpers.default_decorators import wrap_act_default
from paddle.trainer_config_helpers.default_decorators import \
wrap_bias_attr_default
from paddle.trainer_config_helpers.default_decorators import wrap_name_default
from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
from paddle.trainer_config_helpers.layers import layer_support
from paddle.trainer.config_parser import \
RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
RecurrentLayerGroupEnd, model_type
import activation
import re
import attr
import data_type
from config_base import Layer, __convert_to_v2__
__all__ = ['parse_network', 'data']
def parse_network(*outputs):
def parse_network(output_layers, extra_layers=None):
"""
Parse all output layers and then generate a ModelConfig object.
Parse all layers in the neural network graph and
then generate a ModelConfig object.
.. note::
This function is used internally in the paddle.v2 module. Users should
never invoke this method.
:param outputs: Output layers.
:type outputs: Layer
:param output_layers: Output layers.
:type output_layers: Layer
:param extra_layers: Layers that are not in the path of output_layers but
should still be parsed.
:type extra_layers: Layer
:return: A ModelConfig object instance.
:rtype: ModelConfig
"""
if not isinstance(output_layers, collections.Sequence):
output_layers = [output_layers]
if extra_layers is not None and not isinstance(extra_layers,
collections.Sequence):
extra_layers = [extra_layers]
def __real_func__():
"""
......@@ -74,7 +86,11 @@ def parse_network(*outputs):
the plain old paddle configuration function.
"""
context = dict()
real_output = [each.to_proto(context=context) for each in outputs]
real_output = [each.to_proto(context=context) for each in output_layers]
if extra_layers is not None:
extra_output = [
each.to_proto(context=context) for each in extra_layers
]
conf_helps.outputs(real_output)
return __parse__(__real_func__)
......@@ -119,54 +135,23 @@ class DataLayerV2(Layer):
return doc
class WithExtraParent(Layer):
def extra_parent(self):
return self.__extra_parent__
def __init__(self, name=None, parent_layers=None):
self.__extra_parent__ = []
super(WithExtraParent, self).__init__(
name=name, parent_layers=parent_layers)
def append_extra_parent(self, parent):
self.__extra_parent__.append(parent)
def to_proto(self, context):
class MemoryV2(Layer):
def __init__(self, name, extra_input=None, **kwargs):
"""
function to set proto attribute
Init the memory object. If the memory is initialized inside the
recurrent_group step function, it may depend on a boot_layer that should
be initialized outside recurrent_group, so we:
1. add RecurrentLayerInput to extra_parent of self.
2. add boot_layer to the extra_parent of RecurrentLayerInput.
:param extra_input: list of RecurrentLayerInput
:type extra_input: [RecurrentLayerInput]
"""
kwargs = dict()
for p in self.__extra_parent__:
p.to_proto(context=context)
for layer_name in self.__parent_layers__:
if not isinstance(self.__parent_layers__[layer_name],
collections.Sequence):
v1_layer = self.__parent_layers__[layer_name].to_proto(
context=context)
else:
v1_layer = map(lambda x: x.to_proto(context=context),
self.__parent_layers__[layer_name])
kwargs[layer_name] = v1_layer
if self.context_name() is None:
return self.to_proto_impl(context=context, **kwargs)
elif self.context_name() not in context:
context[self.context_name()] = self.to_proto_impl(
context=context, **kwargs)
if self.use_context_name():
return context[self.context_name()]
else:
return context[self.name]
class MemoryV2(WithExtraParent):
def __init__(self, name, **kwargs):
self.name = name
super(MemoryV2, self).__init__(name=name, parent_layers=dict())
self.__kwargs__ = kwargs
self.__boot_layer_name__ = None
if 'boot_layer' in kwargs:
begin_of_current_rnn = []
# TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
......@@ -189,11 +174,10 @@ class MemoryV2(WithExtraParent):
assert begin_of_current_rnn is not None
for extra in begin_of_current_rnn:
self.append_extra_parent(extra)
assert isinstance(extra, WithExtraParent)
extra.append_extra_parent(kwargs['boot_layer'])
self.__boot_layer_name__ = kwargs['boot_layer'].name
def to_proto_impl(self, context, **kwargs):
def to_proto_impl(self, **kwargs):
args = dict()
for each in kwargs:
args[each] = kwargs[each]
......@@ -201,7 +185,7 @@ class MemoryV2(WithExtraParent):
args[each] = self.__kwargs__[each]
if self.__boot_layer_name__ is not None:
args['boot_layer'] = context[self.__boot_layer_name__]
args['boot_layer'] = self.__context__[self.__boot_layer_name__]
size = args.get('size', None)
if size is not None:
......@@ -223,22 +207,6 @@ class MemoryV2(WithExtraParent):
return True
class LayerOutputV2(Layer):
"""
LayerOutputV2 is used to store the result of LayerOutput in v1 api.
It will not store it's parents because layer_output has been parsed already.
"""
def __init__(self, layer_output):
assert isinstance(layer_output, conf_helps.LayerOutput)
self.layer_output = layer_output
super(LayerOutputV2, self).__init__(
name=layer_output.name, parent_layers=dict())
def to_proto_impl(self):
return self.layer_output
class StaticInputV2(object):
def __init__(self, input, is_seq=False, size=None):
assert isinstance(input, LayerV2)
......@@ -250,6 +218,66 @@ class StaticInputV2(object):
# assert input.size is not None or size is not None
class BaseGeneratedInputV2(object):
def __init__(self):
self.bos_id = None
self.eos_id = None
def before_real_step(self):
raise NotImplementedError()
def after_real_step(self, *args):
raise NotImplementedError()
class GeneratedInputV2(BaseGeneratedInputV2):
def __init__(self, size, embedding_name, embedding_size):
super(GeneratedInputV2, self).__init__()
self.size = size
self.embedding_name = embedding_name
self.embedding_size = embedding_size
def after_real_step(self, input):
return max_id(input=input, name='__beam_search_predict__')
def before_real_step(self):
predict_id = memory(
name='__beam_search_predict__',
size=self.size,
boot_with_const_id=self.bos_id)
trg_emb = embedding(
input=predict_id,
size=self.embedding_size,
param_attr=attr.ParamAttr(name=self.embedding_name))
return trg_emb
class RecurrentLayerGroupSetGeneratorV2(Layer):
def __init__(self, eos_name, max_length, beam_size, num_results_per_sample):
self.eos_name = eos_name
self.max_length = max_length
self.beam_size = beam_size
self.num_results_per_sample = num_results_per_sample
super(RecurrentLayerGroupSetGeneratorV2, self).__init__(
name=eos_name, parent_layers={})
def to_proto_impl(self, **kwargs):
RecurrentLayerGroupSetGenerator(
Generator(
eos_layer_name=self.eos_name,
max_num_frames=self.max_length,
beam_size=self.beam_size,
num_results_per_sample=self.num_results_per_sample))
return self
def context_name(self):
return self.eos_name + ".fake"
def use_context_name(self):
return True
class MixedLayerV2(Layer):
"""
This class is used to support the `with` grammar. Without it, the following code
......@@ -328,18 +356,24 @@ def mixed(size=0,
return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
class RecurrentLayerInput(WithExtraParent):
class RecurrentLayerInput(Layer):
def __init__(self, recurrent_name, index, parent_layers):
assert len(parent_layers) == 1
self.__parents__ = parent_layers.values()[0]
super(RecurrentLayerInput, self).__init__(
name=self.__parents__[index].name, parent_layers=parent_layers)
parents_len = len(parent_layers)
assert parents_len <= 1
if parents_len == 0:
self.__parents__ = []
else:
self.__parents__ = parent_layers.values()[0]
self.__recurrent_name__ = recurrent_name
name = self.__parents__[
index].name if index >= 0 else self.context_name()
super(RecurrentLayerInput, self).__init__(
name=name, parent_layers=parent_layers)
def context_name(self):
return self.__recurrent_name__ + ".begin"
def to_proto_impl(self, context, **kwargs):
def to_proto_impl(self, **kwargs):
model_type('recurrent_nn')
RecurrentLayerGroupWithoutOutLinksBegin(
name=self.__recurrent_name__,
......@@ -436,6 +470,11 @@ def recurrent_group(step, input, name=None):
for i in xrange(len(non_static_inputs))
]
extra_input = None
if len(non_static_inputs) == 0:
extra_input = RecurrentLayerInput(
recurrent_name=name, index=-1, parent_layers={})
def __real_step__(*args):
rnn_input = list(args)
static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
......@@ -443,6 +482,7 @@ def recurrent_group(step, input, name=None):
mem_name = "__%s_memory__" % static_input.input.name
mem = memory(
name=mem_name,
extra_input=extra_input,
is_seq=static_input.is_seq,
size=static_input.input.calculate_size,
boot_layer=static_input.input)
......@@ -472,6 +512,73 @@ def recurrent_group(step, input, name=None):
return retv
@wrap_name_default()
def beam_search(step,
input,
bos_id,
eos_id,
beam_size,
max_length=500,
name=None,
num_results_per_sample=None):
if num_results_per_sample is None:
num_results_per_sample = beam_size
assert num_results_per_sample <= beam_size
# logger.warning("num_results_per_sample should be less than beam_size")
if isinstance(input, StaticInputV2) or isinstance(input,
BaseGeneratedInputV2):
input = [input]
generated_input_index = -1
real_input = []
for i, each_input in enumerate(input):
assert isinstance(each_input, (StaticInputV2, BaseGeneratedInputV2))
if isinstance(each_input, BaseGeneratedInputV2):
assert generated_input_index == -1
generated_input_index = i
else:
real_input.append(each_input)
assert generated_input_index != -1
gipt = input[generated_input_index]
assert isinstance(gipt, BaseGeneratedInputV2)
gipt.bos_id = bos_id
gipt.eos_id = eos_id
def __real_step__(*args):
eos_name = "__%s_eos_layer__" % name
generator = RecurrentLayerGroupSetGeneratorV2(
eos_name, max_length, beam_size, num_results_per_sample)
args = list(args)
before_step_layer = gipt.before_real_step()
before_step_layer.append_child(
layer=generator, parent_names=[before_step_layer.name])
args.insert(generated_input_index, before_step_layer)
predict = gipt.after_real_step(step(*args))
eos_layer = eos(input=predict, eos_id=eos_id, name=eos_name)
predict.append_child(layer=eos_layer, parent_names=[predict.name])
return predict
# tmp = paddle.layer.recurrent_group(
# step=__real_step__,
# input=real_input,
# reverse=False,
# name=name,
# is_generating=True)
tmp = recurrent_group(step=__real_step__, input=real_input, name=name)
return tmp
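# A hypothetical beam_search call showing how the pieces above fit
# together; `decoder_step`, the layer names, and the dictionary sizes are
# assumed for illustration and are not part of this module:
#
#     trg_embedding = GeneratedInputV2(
#         size=target_dict_dim,
#         embedding_name='_target_language_embedding',
#         embedding_size=512)
#     beam_gen = beam_search(
#         step=decoder_step,
#         input=[StaticInputV2(input=encoded_vector), trg_embedding],
#         bos_id=0,
#         eos_id=1,
#         beam_size=3,
#         max_length=250)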
__projection_names__ = filter(lambda x: x.endswith('_projection'),
dir(conf_helps))
......
......@@ -159,7 +159,8 @@ class Parameters(object):
if not self.has_key(key):
raise ValueError("No such parameter %s" % key)
conf = self.__param_conf__[key]
return tuple(map(int, conf.dims))
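# Parameter configs without explicit dims (presumably bias-like
# parameters) fall back to a 1 x size row vector.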
dims = conf.dims if conf.dims else (1, conf.size)
return tuple(map(int, dims))
def __setitem__(self, key, value):
"""
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from plot import Ploter
__all__ = ['Ploter']
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
class PlotData(object):
def __init__(self):
self.step = []
self.value = []
def append(self, step, value):
self.step.append(step)
self.value.append(value)
def reset(self):
self.step = []
self.value = []
class Ploter(object):
def __init__(self, *args):
self.__args__ = args
self.__plot_data__ = {}
for title in args:
self.__plot_data__[title] = PlotData()
# Demos in notebooks use Ploter to plot figures, but when the .ipynb
# files are converted to .py files for testing, importing matplotlib
# makes the script crash. Setting `export DISABLE_PLOT=True` skips
# importing these libs.
self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
if not self.__plot_is_disabled__():
import matplotlib.pyplot as plt
from IPython import display
self.plt = plt
self.display = display
def __plot_is_disabled__(self):
return self.__disable_plot__ == "True"
def append(self, title, step, value):
assert isinstance(title, basestring)
assert title in self.__plot_data__
data = self.__plot_data__[title]
assert isinstance(data, PlotData)
data.append(step, value)
def plot(self):
if self.__plot_is_disabled__():
return
titles = []
for title in self.__args__:
data = self.__plot_data__[title]
assert isinstance(data, PlotData)
if len(data.step) > 0:
titles.append(title)
self.plt.plot(data.step, data.value)
self.plt.legend(titles, loc='upper left')
self.display.clear_output(wait=True)
self.display.display(self.plt.gcf())
self.plt.gcf().clear()
def reset(self):
for key in self.__plot_data__:
data = self.__plot_data__[key]
assert isinstance(data, PlotData)
data.reset()
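# A minimal usage sketch (titles, steps and values are made up for
# illustration):
#
#     plotter = Ploter('train_cost', 'test_cost')
#     plotter.append('train_cost', 100, 0.75)
#     plotter.append('test_cost', 100, 0.80)
#     plotter.plot()   # returns immediately when DISABLE_PLOT is "True"
#     plotter.reset()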
add_test(NAME test_ploter
COMMAND bash ${PROJ_ROOT}/python/paddle/v2/plot/tests/run_tests.sh
${PYTHON_EXECUTABLE})
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import test_ploter
__all__ = ['test_ploter']
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pushd `dirname $0` > /dev/null
SCRIPTPATH=$PWD
popd > /dev/null
cd $SCRIPTPATH
$1 -m pip install ../../../../../paddle/dist/*.whl
export DISABLE_PLOT="True"
test_list="test_ploter.py"
export PYTHONPATH=$PWD/../../../../../python/
for fn in $test_list
do
echo "test $fn"
$1 $fn
if [ $? -ne 0 ]; then
exit 1
fi
done
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from paddle.v2.plot import Ploter
class TestCommon(unittest.TestCase):
def test_append(self):
title1 = "title1"
title2 = "title2"
plot_test = Ploter(title1, title2)
plot_test.append(title1, 1, 2)
plot_test.append(title1, 2, 5)
plot_test.append(title2, 3, 4)
self.assertEqual(plot_test.__plot_data__[title1].step, [1, 2])
self.assertEqual(plot_test.__plot_data__[title1].value, [2, 5])
self.assertEqual(plot_test.__plot_data__[title2].step, [3])
self.assertEqual(plot_test.__plot_data__[title2].value, [4])
plot_test.reset()
self.assertEqual(plot_test.__plot_data__[title1].step, [])
self.assertEqual(plot_test.__plot_data__[title1].value, [])
self.assertEqual(plot_test.__plot_data__[title2].step, [])
self.assertEqual(plot_test.__plot_data__[title2].value, [])
if __name__ == '__main__':
unittest.main()
......@@ -19,6 +19,7 @@ import paddle.v2.data_type as data_type
import paddle.v2.layer as layer
import paddle.v2.pooling as pooling
import paddle.v2.networks as networks
import paddle.v2.evaluator as evaluator
pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
label = layer.data(name='label', type=data_type.integer_value(10))
......@@ -58,13 +59,13 @@ class ImageLayerTest(unittest.TestCase):
num_channels=16,
pool_type=pooling.Max())
maxout = layer.maxout(input=conv, num_channels=16, groups=4)
print layer.parse_network(maxpool, spp, maxout)
print layer.parse_network([maxpool, spp, maxout])
def test_norm_layer(self):
norm1 = layer.img_cmrnorm(input=conv, size=5)
norm2 = layer.batch_norm(input=conv)
norm3 = layer.sum_to_one_norm(input=conv)
print layer.parse_network(norm1, norm2, norm3)
print layer.parse_network([norm1, norm2, norm3])
class AggregateLayerTest(unittest.TestCase):
......@@ -77,7 +78,8 @@ class AggregateLayerTest(unittest.TestCase):
first_seq = layer.first_seq(input=pixel)
concat = layer.concat(input=[last_seq, first_seq])
seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
print layer.parse_network(pool, last_seq, first_seq, concat, seq_concat)
print layer.parse_network(
[pool, last_seq, first_seq, concat, seq_concat])
class MathLayerTest(unittest.TestCase):
......@@ -94,8 +96,10 @@ class MathLayerTest(unittest.TestCase):
tensor = layer.tensor(a=pixel, b=pixel, size=1000)
cos_sim = layer.cos_sim(a=pixel, b=pixel)
trans = layer.trans(input=tensor)
print layer.parse_network(addto, linear_comb, interpolation, power,
scaling, slope, tensor, cos_sim, trans)
print layer.parse_network([
addto, linear_comb, interpolation, power, scaling, slope, tensor,
cos_sim, trans
])
class ReshapeLayerTest(unittest.TestCase):
......@@ -109,7 +113,8 @@ class ReshapeLayerTest(unittest.TestCase):
repeat = layer.repeat(input=pixel, num_repeats=4)
reshape = layer.seq_reshape(input=pixel, reshape_size=4)
rotate = layer.rotate(input=pixel, height=16, width=49)
print layer.parse_network(block_expand, expand, repeat, reshape, rotate)
print layer.parse_network(
[block_expand, expand, repeat, reshape, rotate])
class RecurrentLayerTest(unittest.TestCase):
......@@ -118,7 +123,7 @@ class RecurrentLayerTest(unittest.TestCase):
recurrent = layer.recurrent(input=word)
lstm = layer.lstmemory(input=word)
gru = layer.grumemory(input=word)
print layer.parse_network(recurrent, lstm, gru)
print layer.parse_network([recurrent, lstm, gru])
class CostLayerTest(unittest.TestCase):
......@@ -138,10 +143,10 @@ class CostLayerTest(unittest.TestCase):
cost10 = layer.sum_cost(input=inference)
cost11 = layer.huber_cost(input=score, label=label)
print layer.parse_network(cost1, cost2)
print layer.parse_network(cost3, cost4)
print layer.parse_network(cost5, cost6)
print layer.parse_network(cost7, cost8, cost9, cost10, cost11)
print layer.parse_network([cost1, cost2])
print layer.parse_network([cost3, cost4])
print layer.parse_network([cost5, cost6])
print layer.parse_network([cost7, cost8, cost9, cost10, cost11])
crf = layer.crf(input=inference, label=label)
crf_decoding = layer.crf_decoding(input=inference, size=3)
......@@ -150,8 +155,8 @@ class CostLayerTest(unittest.TestCase):
nce = layer.nce(input=inference, label=label, num_classes=3)
hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
print layer.parse_network(crf, crf_decoding, ctc, warp_ctc, nce,
hsigmoid)
print layer.parse_network(
[crf, crf_decoding, ctc, warp_ctc, nce, hsigmoid])
class OtherLayerTest(unittest.TestCase):
......@@ -159,7 +164,7 @@ class OtherLayerTest(unittest.TestCase):
maxid = layer.max_id(input=inference)
sampling_id = layer.sampling_id(input=inference)
eos = layer.eos(input=maxid, eos_id=5)
print layer.parse_network(maxid, sampling_id, eos)
print layer.parse_network([maxid, sampling_id, eos])
def test_slicing_joining_layer(self):
pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
......@@ -262,5 +267,20 @@ class NetworkTests(unittest.TestCase):
print layer.parse_network(vgg_out)
class EvaluatorTest(unittest.TestCase):
def test_evaluator(self):
img = layer.data(name='pixel', type=data_type.dense_vector(784))
output = layer.fc(input=img,
size=10,
act=activation.Softmax(),
name='fc_here')
lbl = layer.data(name='label', type=data_type.integer_value(10))
cost = layer.cross_entropy_cost(input=output, label=lbl)
evaluator.classification_error(input=output, label=lbl)
print layer.parse_network(cost)
print layer.parse_network(output)
if __name__ == '__main__':
unittest.main()
......@@ -17,7 +17,6 @@ import collections
from paddle.proto.ModelConfig_pb2 import ModelConfig
import layer as v2_layer
from layer import WithExtraParent
__all__ = ['Topology']
......@@ -41,9 +40,8 @@ def __bfs_travel__(callback, *layers):
__break__ = callback(each_layer)
if __break__:
return
__layers__ = each_layer.__parent_layers__.values()
if isinstance(each_layer, WithExtraParent):
__layers__ = __layers__ + each_layer.extra_parent()
__layers__ = each_layer.__parent_layers__.values() + \
each_layer.extra_parent()
__bfs_travel__(callback, *__layers__)
......@@ -53,14 +51,26 @@ class Topology(object):
and network configs.
"""
def __init__(self, layers):
if not isinstance(layers, collections.Sequence):
__check_layer_type__(layers)
layers = [layers]
for layer in layers:
__check_layer_type__(layer)
def __init__(self, layers, extra_layers=None):
def __check__(layers):
if not isinstance(layers, collections.Sequence):
__check_layer_type__(layers)
layers = [layers]
for layer in layers:
__check_layer_type__(layer)
return layers
layers = __check__(layers)
self.layers = layers
self.__model_config__ = v2_layer.parse_network(*layers)
if extra_layers is not None:
extra_layers = __check__(extra_layers)
self.__model_config__ = v2_layer.parse_network(
layers, extra_layers=extra_layers)
if extra_layers is not None:
self.layers.extend(extra_layers)
assert isinstance(self.__model_config__, ModelConfig)
def proto(self):
......
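# A hedged sketch of how extra_layers is meant to be used: a layer that is
# not on the cost path (the evaluator name here is hypothetical) can be
# kept in the parsed topology by passing it explicitly:
#
#     topo = Topology(cost, extra_layers=[my_evaluator])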
......@@ -37,9 +37,12 @@ class SGD(object):
:type cost: paddle.v2.config_base.Layer
:param parameters: The parameters dictionary.
:type parameters: paddle.v2.parameters.Parameters
:param extra_layers: Layers that are not on the path from the input to
the cost layer but should still be added to the network
topology, e.g. evaluation layers.
:type extra_layers: paddle.v2.config_base.Layer
"""
def __init__(self, cost, parameters, update_equation):
def __init__(self, cost, parameters, update_equation, extra_layers=None):
if not isinstance(parameters, v2_parameters.Parameters):
raise TypeError('parameters should be parameters')
......@@ -47,11 +50,17 @@ class SGD(object):
if not isinstance(update_equation, v2_optimizer.Optimizer):
raise TypeError("update equation parameter must be "
"paddle.v2.optimizer.Optimizer")
topology = Topology(cost)
topology = Topology(cost, extra_layers=extra_layers)
self.__optimizer__ = update_equation
self.__topology__ = topology
self.__parameters__ = parameters
self.__topology_in_proto__ = topology.proto()
# In local mode, disable sparse_remote_update.
for param in self.__topology_in_proto__.parameters:
if param.sparse_remote_update:
param.sparse_remote_update = False
self.__data_types__ = topology.data_type()
gm = api.GradientMachine.createFromConfigProto(
self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
......
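# A hedged usage sketch; the cost, parameters, optimizer, and the extra
# prediction layer are assumed for illustration:
#
#     trainer = SGD(cost=cost,
#                   parameters=parameters,
#                   update_equation=optimizer,
#                   extra_layers=[prediction])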
......@@ -7,7 +7,8 @@ packages=['paddle',
'paddle.utils',
'paddle.v2',
'paddle.v2.dataset',
'paddle.v2.reader']
'paddle.v2.reader',
'paddle.v2.plot']
setup(name='paddle',
version='${PADDLE_VERSION}',
......