提交 fa1c7cc3 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into complete_backward_doc

...@@ -24,4 +24,5 @@ cmake-build-* ...@@ -24,4 +24,5 @@ cmake-build-*
python/paddle/v2/framework/core.so python/paddle/v2/framework/core.so
CMakeFiles CMakeFiles
cmake_install.cmake cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
description: Format files with ClangFormat. description: Format files with ClangFormat.
entry: clang-format -i entry: clang-format -i
language: system language: system
files: \.(c|cc|cxx|cpp|h|hpp|hxx)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang - repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks: hooks:
......
...@@ -37,8 +37,8 @@ before_install: ...@@ -37,8 +37,8 @@ before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
# protobuf version. # protobuf version.
- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- pip install rarfile - pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
- curl https://glide.sh/get | bash - curl https://glide.sh/get | bash
- eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
- go get -u github.com/alecthomas/gometalinter - go get -u github.com/alecthomas/gometalinter
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
cmake_minimum_required(VERSION 3.0) cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
include(system) include(system)
...@@ -121,8 +121,8 @@ include(version) # set PADDLE_VERSION ...@@ -121,8 +121,8 @@ include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage include(coveralls) # set code coverage
include_directories("${PROJ_ROOT}") include_directories("${PADDLE_SOURCE_DIR}")
include_directories("${PROJ_ROOT}/paddle/cuda/include") include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
include_directories(${Boost_INCLUDE_DIRS}) include_directories(${Boost_INCLUDE_DIRS})
...@@ -144,7 +144,7 @@ if(WITH_GPU) ...@@ -144,7 +144,7 @@ if(WITH_GPU)
endif(WITH_GPU) endif(WITH_GPU)
if(WITH_MKLDNN) if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB}) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
endif() endif()
if(USE_NNPACK) if(USE_NNPACK)
...@@ -164,10 +164,12 @@ if(WITH_GOLANG) ...@@ -164,10 +164,12 @@ if(WITH_GOLANG)
add_subdirectory(go) add_subdirectory(go)
endif(WITH_GOLANG) endif(WITH_GOLANG)
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
add_subdirectory(paddle) add_subdirectory(paddle)
if(WITH_PYTHON) if(WITH_PYTHON)
add_subdirectory(python) add_subdirectory(python)
endif() endif()
if(WITH_DOC) if(WITH_DOC)
add_subdirectory(doc) add_subdirectory(doc)
endif() endif()
...@@ -27,25 +27,24 @@ RUN apt-get update && \ ...@@ -27,25 +27,24 @@ RUN apt-get update && \
git python-pip python-dev openssh-server bison \ git python-pip python-dev openssh-server bison \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc-4.8 g++-4.8 \ python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format-3.8 swig doxygen cmake \ automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \ net-tools && \
apt-get clean -y apt-get clean -y
# Install Go and glide # Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -C /usr/local -xzf go.tgz && \ tar -xz -C /usr/local && \
mkdir /root/gopath && \ mkdir /root/gopath && \
mkdir /root/gopath/bin && \ mkdir /root/gopath/bin && \
mkdir /root/gopath/src && \ mkdir /root/gopath/src
rm go.tgz
ENV GOROOT=/usr/local/go GOPATH=/root/gopath ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# install glide # install glide
RUN curl -q https://glide.sh/get | sh RUN curl -s -q https://glide.sh/get | sh
# git credential to skip password typing # git credential to skip password typing
RUN git config --global credential.helper store RUN git config --global credential.helper store
...@@ -56,19 +55,37 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 ...@@ -56,19 +55,37 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
# version util jupyter fixes this issue. # version util jupyter fixes this issue.
RUN pip install --upgrade pip && \ RUN pip install --upgrade pip && \
pip install -U 'protobuf==3.1.0' && \ pip install -U wheel && \
pip install -U wheel pillow BeautifulSoup && \
pip install -U docopt PyYAML sphinx && \ pip install -U docopt PyYAML sphinx && \
pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \ pip install -U sphinx-rtd-theme==0.1.9 recommonmark
pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install rarfile pip install opencv-python
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev RUN apt-get install -y libssl-dev libffi-dev
RUN pip install certifi urllib3[secure] RUN pip install certifi urllib3[secure]
# TODO(qijun) The template library Eigen doesn't work well with GCC 5
# coming with the default Docker image, so we switch to use GCC 4.8
# by default. And I will check Eigen library later.
RUN ln -sf gcc-4.8 /usr/bin/gcc && \
ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \
ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \
ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \
ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \
ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \
ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \
ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \
ln -sf g++-4.8 /usr/bin/g++ && \
ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++
# Install woboq_codebrowser to /woboq # Install woboq_codebrowser to /woboq
RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
(cd /woboq \ (cd /woboq \
......
...@@ -74,8 +74,6 @@ if(WITH_MKLDNN) ...@@ -74,8 +74,6 @@ if(WITH_MKLDNN)
set(OPENMP_FLAGS "-fopenmp") set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else() else()
...@@ -131,7 +129,7 @@ if(WITH_GOLANG) ...@@ -131,7 +129,7 @@ if(WITH_GOLANG)
add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
COMMAND env GOPATH=${GOPATH} ${GLIDE} install COMMAND env GOPATH=${GOPATH} ${GLIDE} install
COMMAND touch ${CMAKE_BINARY_DIR}/glide COMMAND touch ${CMAKE_BINARY_DIR}/glide
DEPENDS ${PROJ_ROOT}/go/glide.lock DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
) )
......
...@@ -42,26 +42,21 @@ macro(add_style_check_target TARGET_NAME) ...@@ -42,26 +42,21 @@ macro(add_style_check_target TARGET_NAME)
if(WITH_STYLE_CHECK) if(WITH_STYLE_CHECK)
set(SOURCES_LIST ${ARGN}) set(SOURCES_LIST ${ARGN})
list(REMOVE_DUPLICATES SOURCES_LIST) list(REMOVE_DUPLICATES SOURCES_LIST)
list(SORT SOURCES_LIST)
foreach(filename ${SOURCES_LIST}) foreach(filename ${SOURCES_LIST})
set(LINT ON)
foreach(pattern ${IGNORE_PATTERN}) foreach(pattern ${IGNORE_PATTERN})
if(filename MATCHES ${pattern}) if(filename MATCHES ${pattern})
message(STATUS "DROP LINT ${filename}") list(REMOVE_ITEM SOURCES_LIST ${filename})
set(LINT OFF)
endif() endif()
endforeach() endforeach()
if(LINT MATCHES ON) endforeach()
# cpplint code style
get_filename_component(base_filename ${filename} NAME) if(SOURCES_LIST)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}" "--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename} ${SOURCES_LIST}
COMMENT "cpplint: Checking source code style"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endforeach()
endif() endif()
endmacro() endmacro()
...@@ -7,8 +7,8 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ...@@ -7,8 +7,8 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
ExternalProject_Add( ExternalProject_Add(
extern_lib_any extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/thelink2012/any.git" GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
PREFIX ${ANY_SOURCE_DIR} PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
......
...@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ...@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
ExternalProject_Add( ExternalProject_Add(
extern_gflags extern_gflags
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/gflags/gflags.git" # TODO(yiwang): The annoying warnings mentioned in
# https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
# gflags. I fired a PR https://github.com/gflags/gflags/pull/230
# to fix it. Before it gets accepted by the gflags team, we use
# my personal fork, which contains above fix, temporarily. Let's
# change this back to the official Github repo once my PR is
# merged.
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git"
PREFIX ${GFLAGS_SOURCES_DIR} PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
...@@ -43,8 +43,8 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") ...@@ -43,8 +43,8 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt) FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n" "PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n" "cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${MKLML_VER}\n" "install(DIRECTORY ${MKLML_VER}\n"
" DESTINATION ${MKLML_DST_DIR})\n") " DESTINATION ${MKLML_DST_DIR})\n")
...@@ -54,8 +54,7 @@ ExternalProject_Add( ...@@ -54,8 +54,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${MKLML_SOURCE_DIR} PREFIX ${MKLML_SOURCE_DIR}
DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL} DOWNLOAD_COMMAND wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR}
&& tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
......
...@@ -69,9 +69,22 @@ ENDIF(NOT ${CBLAS_FOUND}) ...@@ -69,9 +69,22 @@ ENDIF(NOT ${CBLAS_FOUND})
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
ADD_LIBRARY(cblas STATIC IMPORTED) # FIXME(gangliao): generate cblas target to track all high performance
SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES}) # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
# Build a real (non-imported) `cblas` target from a generated one-line C file
# and attach the discovered BLAS libraries to it, so other targets can simply
# depend on `cblas`.
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
# Quote the expansion: an empty or unset CBLAS_PROVIDER then evaluates to
# false (STATIC library) instead of producing a malformed IF() at configure time.
IF("${CBLAS_PROVIDER}" MATCHES MKL)
  ADD_LIBRARY(cblas SHARED ${dummyfile})
ELSE()
  ADD_LIBRARY(cblas STATIC ${dummyfile})
ENDIF()
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas) ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas) LIST(APPEND external_project_dependencies cblas)
ELSE()
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_DEPENDENCIES(cblas mklml)
ENDIF()
ENDIF(NOT ${CBLAS_FOUND}) ENDIF(NOT ${CBLAS_FOUND})
...@@ -24,7 +24,6 @@ IF(WITH_PYTHON) ...@@ -24,7 +24,6 @@ IF(WITH_PYTHON)
ENDIF(WITH_PYTHON) ENDIF(WITH_PYTHON)
SET(py_env "") SET(py_env "")
SET(USE_VIRTUALENV_FOR_TEST 1)
IF(PYTHONINTERP_FOUND) IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED) find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED) find_python_module(numpy REQUIRED)
......
...@@ -9,11 +9,13 @@ function(CheckCompilerCXX11Flag) ...@@ -9,11 +9,13 @@ function(CheckCompilerCXX11Flag)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
endif() endif()
if(NOT ANDROID)
# TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem. # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
# Use Debug mode instead for now. # Use Debug mode instead for now.
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE) set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
endif() endif()
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
# Apple Clang is a different compiler than upstream Clang which havs different version numbers. # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
...@@ -115,7 +117,7 @@ set(COMMON_FLAGS ...@@ -115,7 +117,7 @@ set(COMMON_FLAGS
-Wno-error=literal-suffix -Wno-error=literal-suffix
-Wno-error=sign-compare -Wno-error=sign-compare
-Wno-error=unused-local-typedefs -Wno-error=unused-local-typedefs
-Wno-error=parentheses-equality # Warnings in Pybind11 -Wno-error=parentheses-equality # Warnings in pybind11
) )
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
...@@ -195,6 +197,7 @@ endif() ...@@ -195,6 +197,7 @@ endif()
# Modern gpu architectures: Pascal # Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif() endif()
# Custom gpu architecture # Custom gpu architecture
......
...@@ -187,7 +187,13 @@ function(cc_library TARGET_NAME) ...@@ -187,7 +187,13 @@ function(cc_library TARGET_NAME)
endif() endif()
# cpplint code style # cpplint code style
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS}) foreach(source_file ${cc_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
else(cc_library_SRCS) else(cc_library_SRCS)
if (cc_library_DEPS) if (cc_library_DEPS)
...@@ -239,6 +245,14 @@ function(nv_library TARGET_NAME) ...@@ -239,6 +245,14 @@ function(nv_library TARGET_NAME)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
endif() endif()
# cpplint code style
# For every source file, also lint the header that shares its base name,
# when such a header exists in the current source directory.
foreach(source_file ${nv_library_SRCS})
  # Strip the last extension (e.g. "foo.cu" -> "foo").
  string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
    # Collect into nv_library_HEADERS: this is the list handed to
    # add_style_check_target below. Appending to cc_library_HEADERS here
    # left that list empty, so headers were never checked.
    list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
  endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
else(nv_library_SRCS) else(nv_library_SRCS)
if (nv_library_DEPS) if (nv_library_DEPS)
merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
...@@ -389,3 +403,16 @@ function(py_proto_compile TARGET_NAME) ...@@ -389,3 +403,16 @@ function(py_proto_compile TARGET_NAME)
protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
endfunction() endfunction()
# py_test(<TARGET_NAME> SRCS <script>... [DEPS <dep>...])
#
# Registers a CTest test named <TARGET_NAME> that runs the given SRCS with
# `python2`, with PYTHONPATH set to ${PADDLE_PYTHON_BUILD_DIR}/lib-python and
# the current source directory as working directory.
# Does nothing unless WITH_TESTING is enabled.
# DEPS (and the STATIC/SHARED flags) are accepted by the argument parser but
# are not used by this function.
function(py_test TARGET_NAME)
  if(NOT WITH_TESTING)
    return()
  endif()

  set(flag_keywords STATIC static SHARED shared)
  set(single_value_keywords "")
  set(multi_value_keywords SRCS DEPS)
  cmake_parse_arguments(py_test
    "${flag_keywords}"
    "${single_value_keywords}"
    "${multi_value_keywords}"
    ${ARGN})

  add_test(
    NAME ${TARGET_NAME}
    COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
            python2 ${py_test_SRCS}
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endfunction()
...@@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "") ...@@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
set(CPACK_DEBIAN_PACKAGE_SECTION Devel) set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst")
#set(CPACK_GENERATOR "DEB") #set(CPACK_GENERATOR "DEB")
# Start cpack # Start cpack
include (CMakePackageConfigHelpers) include (CMakePackageConfigHelpers)
......
...@@ -118,7 +118,6 @@ endfunction() ...@@ -118,7 +118,6 @@ endfunction()
macro(add_unittest_without_exec TARGET_NAME) macro(add_unittest_without_exec TARGET_NAME)
add_executable(${TARGET_NAME} ${ARGN}) add_executable(${TARGET_NAME} ${ARGN})
link_paddle_test(${TARGET_NAME}) link_paddle_test(${TARGET_NAME})
add_style_check_target(${TARGET_NAME} ${ARGN})
endmacro() endmacro()
# add_unittest # add_unittest
...@@ -142,17 +141,20 @@ endmacro() ...@@ -142,17 +141,20 @@ endmacro()
function(create_resources res_file output_file) function(create_resources res_file output_file)
add_custom_command( add_custom_command(
OUTPUT ${output_file} OUTPUT ${output_file}
COMMAND python ARGS ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file} COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py) DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
endfunction() endfunction()
# Create a python unittest using run_python_tests.sh, # Create a python unittest using run_python_tests.sh,
# which takes care of making correct running environment # which takes care of making correct running environment
function(add_python_test TEST_NAME) function(add_python_test TEST_NAME)
add_test(NAME ${TEST_NAME} foreach(arg ${ARGN})
COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR} get_filename_component(py_fn ${arg} NAME_WE)
bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh set(TRG_NAME ${TEST_NAME}_${py_fn})
${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} add_test(NAME ${TRG_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${arg}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
endfunction() endfunction()
...@@ -4,7 +4,7 @@ set(tmp_version "HEAD") ...@@ -4,7 +4,7 @@ set(tmp_version "HEAD")
while ("${PADDLE_VERSION}" STREQUAL "") while ("${PADDLE_VERSION}" STREQUAL "")
execute_process( execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version} COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
WORKING_DIRECTORY ${PROJ_ROOT} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_NAME OUTPUT_VARIABLE GIT_TAG_NAME
RESULT_VARIABLE GIT_RESULT RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
......
...@@ -257,6 +257,16 @@ seq_concat ...@@ -257,6 +257,16 @@ seq_concat
.. autoclass:: paddle.v2.layer.seq_concat .. autoclass:: paddle.v2.layer.seq_concat
:noindex: :noindex:
kmax_sequence_score
-------------------
.. autoclass:: paddle.v2.layer.kmax_sequence_score
:noindex:
sub_nested_seq
--------------
.. autoclass:: paddle.v2.layer.sub_nested_seq
:noindex:
Reshaping Layers Reshaping Layers
================ ================
......
## Auto Gradient Checker Design
## Background:
- Operator forward computing is easy to check if the result is right because it has a clear definition. **But** backpropagation is a notoriously difficult algorithm to debug and get right:
- 1. you should get the right backpropagation formula according to the forward computation.
- 2. you should implement it right in CPP.
- 3. it's difficult to prepare test data.
- Auto gradient checking computes a numeric gradient using only the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
- 1. the numeric gradient checker only needs the forward operator.
- 2. the user only needs to prepare the input data for the forward Operator.
## Mathematical Theory
The following two documents from Stanford give a detailed explanation of how to compute a numeric gradient and why it is useful.
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
## Numeric Gradient Implementation
### Python Interface
```python
def get_numeric_gradient(op,
input_values,
output_name,
input_to_check,
delta=0.005,
local_scope=None):
"""
Get Numeric Gradient for an operator's input.
:param op: C++ operator instance, could be a network
:param input_values: The input variables. Should be a dictionary whose keys
are variable names and whose values are numpy arrays.
:param output_name: The final output variable name.
:param input_to_check: The input variable whose gradient is needed.
:param delta: The perturbation value for the numeric gradient method. The
smaller delta is, the more accurate the result will be. But if delta is
too small, numerical stability problems may occur.
:param local_scope: The local scope used for get_numeric_gradient.
:return: The gradient array in numpy format.
"""
```
### Explanation:
- Why is `output_name` needed?
- One Operator may have multiple Outputs, and an independent gradient can be obtained from each Output. So the user should choose one output to compute the gradient from.
- Why is `input_to_check` needed?
- One operator may have multiple inputs. Gradient Op can calculate the gradient of these Inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
### Core Algorithm Implementation
```python
# we only compute gradient of one element each time.
# we use a for loop to compute the gradient of every element.
for i in xrange(tensor_size):
# get one input element through its index i.
origin = tensor_to_check.get_float_element(i)
# add delta to it, run op and then get the sum of the result tensor.
x_pos = origin + delta
tensor_to_check.set_float_element(i, x_pos)
y_pos = get_output()
# subtract delta from this element, run op and get the sum of the result tensor.
x_neg = origin - delta
tensor_to_check.set_float_element(i, x_neg)
y_neg = get_output()
# restore old value
tensor_to_check.set_float_element(i, origin)
# compute the gradient of this element and store it into a numpy array.
gradient_flat[i] = (y_pos - y_neg) / delta / 2
# reshape the gradient result to the shape of the source tensor.
return gradient_flat.reshape(tensor_to_check.get_dims())
```
## Auto Gradient Checker Framework
Each Operator Kernel has three kinds of Gradient:
- 1. Numeric Gradient
- 2. CPU Operator Gradient
- 3. GPU Operator Gradient(if supported)
Numeric Gradient only relies on the forward Operator. So we use Numeric Gradient as the reference value.
- 1. calculate the numeric gradient.
- 2. calculate CPU kernel Gradient with the backward Operator and compare it with the numeric gradient.
- 3. calculate GPU kernel Gradient with the backward Operator and compare it with the numeric gradient (if GPU is supported).
#### Python Interface
```python
def check_grad(self,
forward_op,
input_vars,
inputs_to_check,
output_name,
no_grad_set=None,
only_cpu=False,
max_relative_error=0.005):
"""
:param forward_op: used to create backward_op
:param input_vars: numpy value of input variable. The following
computation will use these variables.
:param inputs_to_check: inputs var names that should check gradient.
:param output_name: the final output variable name, used to compute the gradients.
:param max_relative_error: The relative tolerance parameter.
:param no_grad_set: used when create backward ops
:param only_cpu: only compute and check gradient on cpu kernel.
:return:
"""
```
### How to check if two numpy arrays are close enough?
If `abs_numeric_grad` is nearly zero, use the absolute error for `numeric_grad` instead of the relative error.
```python
numeric_grad = ...
operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
abs_numeric_grad = numpy.abs(numeric_grad)
# if abs_numeric_grad is nearly zero, then use abs error for numeric_grad, not relative
# error.
abs_numeric_grad[abs_numeric_grad < 1e-3] = 1
diff_mat = numpy.abs(abs_numeric_grad - operator_grad) / abs_numeric_grad
max_diff = numpy.max(diff_mat)
```
#### Notes:
1. The input data for the auto gradient checker should be reasonable, to avoid numerical problems.
#### Refs:
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
# Analysis of large model distributed training in Paddle
***NOTE: These are only notes on how we implemented this scheme in V1, not a new design.***
## What is it
We often encounter cases where the embedding layer parameters (sparse) are so large that we cannot store them in the trainer's memory when training. So we need to put them on several servers, and fetch them row by row instead of fetching all of the parameters.
## How to use
Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer. Also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting pserver processes.
Accordingly, configure your embedding layers like:
```python
SPARSE_REMOTE=True
w1 = data_layer(name="w1", size=dict_size)
emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
w2 = data_layer(name="w2", size=dict_size)
emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
...
```
## Implementation details
```c++
enum MatType {
MAT_NORMAL,
MAT_NORMAL_SHARED,
MAT_VALUE_SHARED,
MAT_SPARSE_ROW_IDS,
MAT_SPARSE_ROW_AUTO_GROW,
MAT_CACHE_ROW,
MAT_SPARSE_ROW,
MAT_SPARSE_ROW_PREFETCH,
MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
};
```
`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only rows of the matrix during training.
In `trainer_internal.cpp:L93 trainOneBatch`:
```c++
if (config_->getOptConfig().use_sparse_remote_updater()) {
REGISTER_TIMER("prefetch");
gradientMachine_->prefetch(inArgs);
parameterUpdater_->getParametersRemote();
}
```
When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.
In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
```c++
if (fullSize) {
...
} else {
getParams = [&] {
parameterClient_->getParameterSparse(
/* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
};
applyL1 = [](Parameter& para, real decayRate) {
para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
};
}
```
Calling `parameterClient_->getParameterSparse` makes a remote call to pserver's `getParameterSparse`:
```c++
void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
std::vector<Buffer>& inputBuffers,
SendParameterResponse* response,
std::vector<Buffer>* outputBuffers) {
(void)inputBuffers;
auto& buffer = *readWriteBuffer_;
size_t numReals = 0;
for (const auto& block : request.blocks()) {
numReals += getParameterConfig(block).dims(1);
}
buffer.resize(numReals);
VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
ReadLockGuard guard(parameterMutex_);
size_t offset = 0;
for (const auto& block : request.blocks()) {
size_t width = getParameterConfig(block).dims(1);
Buffer buf = {buffer.data() + offset, width};
int type = request.send_back_parameter_type();
sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
offset += width;
}
}
```
`getParameterConfig(block).dims(1)` returns the width of the current "parameter block"(a shard of parameter object),
then `getParameterSparse` remote call returns only one row of data to the client.
# Intel® MKL-DNN on PaddlePaddle: Design Doc
我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。
我们短期内的基本目标是:
- 完成常用layer的MKL-DNN实现。
- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。
## Contents
- [Overview](#overview)
- [Actions](#actions)
- [CMake](#cmake)
- [Layers](#layers)
- [Activations](#activations)
- [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api)
- [Demos](#demos)
- [Benchmarking](#benchmarking)
- [Others](#others)
- [Design Concerns](#design-concerns)
## Overview
我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图
<div align="center">
<img src="image/overview.png" width=350><br/>
Figure 1. PaddlePaddle on IA.
</div>
## Actions
我们把集成方案大致分为了如下几个方面。
### CMake
我们会在`CMakeLists.txt`中添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。
同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。
所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。
**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。
### Layers
所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在
`paddle/gserver/layers`中,并且文件名都会以*Mkldnn*开头。
所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。
### Activations
由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口,实现方法还是会在`ActivationFunction.cpp`文件。
### Unit Tests
会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。
Activation的测试,计划在PaddlePaddle原有的测试文件上直接添加新的测试type。
### Protobuf Messages
根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
### Python API
目前只考虑**v1 API**。
计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。
具体实现方式比如:
```python
use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
if use_mkldnn
self.layer_type = mkldnn_*
```
所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。
并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py`和`layers.py`里面添加必要的MKL-DNN的接口。
### Demos
会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。
### Benchmarking
会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`,添加使用MKL-DNN的测试。
### Others
1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。
2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。
## Design Concerns
为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\],
我们总结出一些特别需要注意的点:
1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MkldnnLayer`特有的设备ID。
2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。
3. 创建`MkldnnMatrix`,用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
4. 创建`MkldnnBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`,和未来可能还会用到`FPGAEngine`等。
5. 在**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue`和`mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。
## References
1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。
3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。
...@@ -11,6 +11,15 @@ Paddle每次发新的版本,遵循以下流程: ...@@ -11,6 +11,15 @@ Paddle每次发新的版本,遵循以下流程:
* 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。
* 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性
* 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步 * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步
* 编译这个版本的python wheel包,并发布到pypi。
* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`
* pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`
* 上传方法:
```
cd build/python
pip install twine
twine upload dist/[package to upload]
```
4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面
6. 协同完成Release Note的书写 6. 协同完成Release Note的书写
......
...@@ -3,6 +3,43 @@ PaddlePaddle的Docker容器使用方式 ...@@ -3,6 +3,43 @@ PaddlePaddle的Docker容器使用方式
PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
Docker使用入门
------------------------------
几个基础的概念帮助理解和使用Docker:
- *镜像*:一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后再执行。可以执行:
.. code-block:: bash
docker images
来列出当前系统中的所有镜像,同样可以执行:
.. code-block:: bash
docker pull paddlepaddle/paddle:0.10.0
来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用docker.paddlepaddle.org/paddle下载。
- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。
实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。
可以执行:
.. code-block:: bash
docker run paddlepaddle/paddle:0.10.0
来使用一个镜像启动一个容器。
- 默认情况下,Docker容器会运行在独立的文件系统空间之上,我们无法在Docker容器中
访问到主机上的文件。可以通过*挂载Volume*的方式,将主机上的文件或目录挂载到
Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下,容器使用
debian镜像,并且启动后执行 :code:`ls /data`。
.. code-block:: bash
docker run --rm -v $(pwd):/data debian ls /data
PaddlePaddle发布的Docker镜像使用说明 PaddlePaddle发布的Docker镜像使用说明
------------------------------ ------------------------------
...@@ -12,11 +49,11 @@ PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打 ...@@ -12,11 +49,11 @@ PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打
像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次
PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运
行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 提供最新 `dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国 和国内镜像`docker.paddlepaddle.org` 提供最新
内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您 的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。
在国内,请把文档里命令中的paddlepaddle/paddle替换成
docker.paddlepaddle.org/paddle。 **注意:为了方便在国内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您在国内,请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。**
1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev` 1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev`
...@@ -37,13 +74,13 @@ docker.paddlepaddle.org/paddle。 ...@@ -37,13 +74,13 @@ docker.paddlepaddle.org/paddle。
.. code-block:: bash .. code-block:: bash
docker run -it --rm paddlepaddle/paddle:0.10.0-dev /bin/bash docker run -it --rm -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /bin/bash
或者,可以以后台进程方式运行容器: 或者,可以以后台进程方式运行容器:
.. code-block:: bash .. code-block:: bash
docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0-dev docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D
然后用密码 :code:`root` SSH进入容器: 然后用密码 :code:`root` SSH进入容器:
...@@ -68,6 +105,8 @@ docker.paddlepaddle.org/paddle。 ...@@ -68,6 +105,8 @@ docker.paddlepaddle.org/paddle。
如果输出是No,就需要选择使用no-AVX的镜像 如果输出是No,就需要选择使用no-AVX的镜像
**注:在0.10.0之后的版本,PaddlePaddle都可以自动判断硬件是否支持AVX,所以无需判断AVX即可使用**
以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。
为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
......
...@@ -63,12 +63,35 @@ CPU-only version and a CUDA GPU version and their no-AVX versions. ...@@ -63,12 +63,35 @@ CPU-only version and a CUDA GPU version and their no-AVX versions.
We put the docker images on `dockerhub.com We put the docker images on `dockerhub.com
<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
latest versions under "tags" tab at dockerhub.com. If you are in latest versions under "tags" tab at dockerhub.com.
China, you can use our Docker image registry mirror to speed up the
download process. To use it, please replace all paddlepaddle/paddle in
the commands to docker.paddlepaddle.org/paddle.
1. Production images, this image might have multiple variants: **NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands with docker.paddlepaddle.org/paddle.**
1. development image :code:`paddlepaddle/paddle:<version>-dev`
This image has packed related develop tools and runtime
environment. Users and developers can use this image instead of
their own local computer to accomplish development, build,
releasing, document writing etc. While different version of paddle
may depends on different version of libraries and tools, if you
want to setup a local environment, you must pay attention to the
versions. The development image contains:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers use servers with GPUs, they can use ssh to login to
the server and run :code:`docker exec` to enter the docker
container and start their work. Also they can start a development
docker image with SSHD service, so they can login to the container
and start work.
2. Production images, this image might have multiple variants:
- GPU/AVX::code:`paddlepaddle/paddle:<version>-gpu` - GPU/AVX::code:`paddlepaddle/paddle:<version>-gpu`
- GPU/no-AVX::code:`paddlepaddle/paddle:<version>-gpu-noavx` - GPU/no-AVX::code:`paddlepaddle/paddle:<version>-gpu-noavx`
...@@ -84,7 +107,7 @@ the commands to docker.paddlepaddle.org/paddle. ...@@ -84,7 +107,7 @@ the commands to docker.paddlepaddle.org/paddle.
if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
**NOTE: versions after 0.10.0 will automatically detect system AVX support, so manual detection is not needed in this case.**
To run the CPU-only image as an interactive container: To run the CPU-only image as an interactive container:
.. code-block:: bash .. code-block:: bash
...@@ -103,29 +126,6 @@ the commands to docker.paddlepaddle.org/paddle. ...@@ -103,29 +126,6 @@ the commands to docker.paddlepaddle.org/paddle.
nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash
2. development image :code:`paddlepaddle/paddle:<version>-dev`
This image has packed related develop tools and runtime
environment. Users and developers can use this image instead of
their own local computer to accomplish development, build,
releasing, document writing etc. While different version of paddle
may depends on different version of libraries and tools, if you
want to setup a local environment, you must pay attention to the
versions. The development image contains:
- gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers use servers with GPUs, they can use ssh to login to
the server and run :code:`docker exec` to enter the docker
container and start their work. Also they can start a development
docker image with SSHD service, so they can login to the container
and start work.
Train Model Using Python API Train Model Using Python API
---------------------------- ----------------------------
......
...@@ -13,22 +13,18 @@ ...@@ -13,22 +13,18 @@
# serve to show the default. # serve to show the default.
import sys import sys
import os, subprocess import os, subprocess
sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
import shlex import shlex
from recommonmark import parser, transform from recommonmark import parser, transform
try: import paddle
import py_paddle import paddle.v2
import paddle
import paddle.v2
except ImportError:
print("Must install paddle python package before generating documentation")
sys.exit(1)
MarkdownParser = parser.CommonMarkParser MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify AutoStructify = transform.AutoStructify
# If extensions (or modules to document with autodoc) are in another directory, # If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the # add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here. # documentation root, use os.path.abspath to make it absolute, like shown here.
templates_path = ["@PROJ_ROOT@/doc_theme/templates"] templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
# -- General configuration ------------------------------------------------ # -- General configuration ------------------------------------------------
...@@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' ...@@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['@PROJ_ROOT@/doc_theme/static'] html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
# Output file base name for HTML help builder. # Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc' htmlhelp_basename = project + 'doc'
......
...@@ -13,15 +13,11 @@ ...@@ -13,15 +13,11 @@
# serve to show the default. # serve to show the default.
import sys import sys
import os, subprocess import os, subprocess
sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
import shlex import shlex
from recommonmark import parser, transform from recommonmark import parser, transform
try: import paddle
import py_paddle import paddle.v2
import paddle
import paddle.v2
except ImportError:
print("Must install paddle python package before generating documentation")
sys.exit(1)
MarkdownParser = parser.CommonMarkParser MarkdownParser = parser.CommonMarkParser
...@@ -29,7 +25,7 @@ AutoStructify = transform.AutoStructify ...@@ -29,7 +25,7 @@ AutoStructify = transform.AutoStructify
# If extensions (or modules to document with autodoc) are in another directory, # If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the # add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here. # documentation root, use os.path.abspath to make it absolute, like shown here.
templates_path = ["@PROJ_ROOT@/doc_theme/templates"] templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
# -- General configuration ------------------------------------------------ # -- General configuration ------------------------------------------------
...@@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' ...@@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['@PROJ_ROOT@/doc_theme/static'] html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
# Output file base name for HTML help builder. # Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc' htmlhelp_basename = project + 'doc'
......
...@@ -32,7 +32,7 @@ import ( ...@@ -32,7 +32,7 @@ import (
func main() { func main() {
port := flag.Int("port", 0, "port of the pserver") port := flag.Int("port", 0, "port of the pserver")
index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") index := flag.Int("index", -1, "index of the pserver, set to -1 if use etcd for auto pserver index registry")
etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
"comma separated endpoint string for pserver to connect to etcd") "comma separated endpoint string for pserver to connect to etcd")
dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout") dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout")
...@@ -60,12 +60,12 @@ func main() { ...@@ -60,12 +60,12 @@ func main() {
idx, err = e.Register(*port) idx, err = e.Register(*port)
candy.Must(err) candy.Must(err)
cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e) cp, err = pserver.LoadCheckpoint(e, idx)
if err != nil { if err != nil {
if err == pserver.ErrCheckpointNotFound { if err == pserver.ErrCheckpointNotFound {
log.Infof("Could not find the pserver checkpoint.") log.Infof("Could not find the pserver checkpoint.")
} else { } else {
log.Errorf("Fetch checkpoint failed, %s", err) panic(err)
} }
} }
} }
......
hash: 2a1c0eca5c07a130e3d224f9821f96cfa37a39bf6bce141c855bbc57ef569f1c hash: 1b9b07408ca7fac27a374dc2ccd2433e4bff090484008a037df967284949a582
updated: 2017-07-29T07:34:48.722757905+08:00 updated: 2017-08-07T23:37:48.867469328Z
imports: imports:
- name: github.com/beorn7/perks - name: github.com/beorn7/perks
version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
...@@ -10,7 +10,7 @@ imports: ...@@ -10,7 +10,7 @@ imports:
- name: github.com/cockroachdb/cmux - name: github.com/cockroachdb/cmux
version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92 version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
- name: github.com/coreos/etcd - name: github.com/coreos/etcd
version: c31bec0f29facff13f7c3e3d948e55dd6689ed42 version: d0d1a87aa96ae14914751d42264262cb69eda170
subpackages: subpackages:
- alarm - alarm
- auth - auth
...@@ -24,6 +24,7 @@ imports: ...@@ -24,6 +24,7 @@ imports:
- error - error
- etcdserver - etcdserver
- etcdserver/api - etcdserver/api
- etcdserver/api/etcdhttp
- etcdserver/api/v2http - etcdserver/api/v2http
- etcdserver/api/v2http/httptypes - etcdserver/api/v2http/httptypes
- etcdserver/api/v3client - etcdserver/api/v3client
...@@ -145,6 +146,8 @@ imports: ...@@ -145,6 +146,8 @@ imports:
version: a1dba9ce8baed984a2495b658c82687f8157b98f version: a1dba9ce8baed984a2495b658c82687f8157b98f
subpackages: subpackages:
- xfs - xfs
- name: github.com/satori/go.uuid
version: 879c5887cd475cd7864858769793b2ceb0d44feb
- name: github.com/sirupsen/logrus - name: github.com/sirupsen/logrus
version: a3f95b5c423586578a4e099b11a46c2479628cac version: a3f95b5c423586578a4e099b11a46c2479628cac
- name: github.com/topicai/candy - name: github.com/topicai/candy
...@@ -208,11 +211,6 @@ testImports: ...@@ -208,11 +211,6 @@ testImports:
version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9 version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
subpackages: subpackages:
- spew - spew
- name: github.com/docker/docker
version: b6d164e6c46d8115b146e4c3ac93784e9ef8b49e
subpackages:
- pkg/ioutils
- pkg/longpath
- name: github.com/pmezard/go-difflib - name: github.com/pmezard/go-difflib
version: d8ed2627bdf02c080bf22230dbb337003b7aba2d version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
subpackages: subpackages:
......
...@@ -14,11 +14,13 @@ import: ...@@ -14,11 +14,13 @@ import:
version: ^1.0.0 version: ^1.0.0
- package: github.com/topicai/candy - package: github.com/topicai/candy
- package: golang.org/x/crypto - package: golang.org/x/crypto
vcs: git
repo: https://github.com/golang/crypto.git repo: https://github.com/golang/crypto.git
- package: golang.org/x/sys
vcs: git vcs: git
- package: golang.org/x/sys
repo: https://github.com/golang/sys.git repo: https://github.com/golang/sys.git
- package: golang.org/x/text
vcs: git vcs: git
- package: golang.org/x/text
repo: https://github.com/golang/text.git repo: https://github.com/golang/text.git
vcs: git
- package: github.com/satori/go.uuid
version: v1.1.0
...@@ -77,11 +77,12 @@ type taskEntry struct { ...@@ -77,11 +77,12 @@ type taskEntry struct {
NumFailure int NumFailure int
} }
type taskQueues struct { type masterState struct {
Todo []taskEntry Todo []taskEntry
Pending map[int]taskEntry // map from task ID to task entry Pending map[int]taskEntry // map from task ID to task entry
Done []taskEntry Done []taskEntry
Failed []taskEntry Failed []taskEntry
CurPass int
} }
// Service is the master server service. // Service is the master server service.
...@@ -95,10 +96,10 @@ type Service struct { ...@@ -95,10 +96,10 @@ type Service struct {
initDone bool initDone bool
mu sync.Mutex mu sync.Mutex
taskQueues taskQueues // State to be persisted to snapshot.
currPass int state masterState
jobTasks []taskEntry // The trainer that is currently saving model. This state is
// transient, does not need to be persisted to snapshot.
savingTrainer string savingTrainer string
} }
...@@ -141,8 +142,8 @@ func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failur ...@@ -141,8 +142,8 @@ func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failur
s.chunksPerTask = chunksPerTask s.chunksPerTask = chunksPerTask
s.timeoutDur = timeoutDur s.timeoutDur = timeoutDur
s.failureMax = failureMax s.failureMax = failureMax
s.taskQueues = taskQueues{} s.state = masterState{}
s.taskQueues.Pending = make(map[int]taskEntry) s.state.Pending = make(map[int]taskEntry)
s.ready = make(chan struct{}) s.ready = make(chan struct{})
s.store = store s.store = store
recovered, err := s.recover() recovered, err := s.recover()
...@@ -180,7 +181,7 @@ func (s *Service) recover() (bool, error) { ...@@ -180,7 +181,7 @@ func (s *Service) recover() (bool, error) {
} }
dec := gob.NewDecoder(gr) dec := gob.NewDecoder(gr)
var tqs taskQueues var tqs masterState
err = dec.Decode(&tqs) err = dec.Decode(&tqs)
if err != nil { if err != nil {
return false, err return false, err
...@@ -193,7 +194,12 @@ func (s *Service) recover() (bool, error) { ...@@ -193,7 +194,12 @@ func (s *Service) recover() (bool, error) {
log.Errorln(err) log.Errorln(err)
} }
s.taskQueues = tqs s.state = tqs
log.WithFields(s.logFields()).Infof("Master recovered from snapshot, scheduling pending task timeout check.")
for _, t := range s.state.Pending {
time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
}
return true, nil return true, nil
} }
...@@ -208,7 +214,7 @@ func (s *Service) snapshot() error { ...@@ -208,7 +214,7 @@ func (s *Service) snapshot() error {
var buf bytes.Buffer var buf bytes.Buffer
gw := gzip.NewWriter(&buf) gw := gzip.NewWriter(&buf)
enc := gob.NewEncoder(gw) enc := gob.NewEncoder(gw)
err := enc.Encode(s.taskQueues) err := enc.Encode(s.state)
if err != nil { if err != nil {
return err return err
} }
...@@ -290,8 +296,7 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error { ...@@ -290,8 +296,7 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error {
return err return err
} }
s.jobTasks = partition(chunks, s.chunksPerTask) s.state.Todo = partition(chunks, s.chunksPerTask)
s.taskQueues.Todo = s.jobTasks
err = s.snapshot() err = s.snapshot()
if err != nil { if err != nil {
...@@ -319,17 +324,17 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) { ...@@ -319,17 +324,17 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
} }
}() }()
delete(s.taskQueues.Pending, t.Task.Meta.ID) delete(s.state.Pending, t.Task.Meta.ID)
t.NumFailure++ t.NumFailure++
if t.NumFailure > s.failureMax { if t.NumFailure > s.failureMax {
log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure) log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
s.taskQueues.Failed = append(s.taskQueues.Failed, t) s.state.Failed = append(s.state.Failed, t)
return return
} }
log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure) log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure)
s.taskQueues.Todo = append(s.taskQueues.Todo, t) s.state.Todo = append(s.state.Todo, t)
return return
} }
...@@ -338,7 +343,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { ...@@ -338,7 +343,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[taskID] t, ok := s.state.Pending[taskID]
if !ok { if !ok {
return return
} }
...@@ -350,10 +355,11 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { ...@@ -350,10 +355,11 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
// must be called with lock held. // must be called with lock held.
func (s *Service) logFields() log.Fields { func (s *Service) logFields() log.Fields {
return log.Fields{ return log.Fields{
"todoLen": len(s.taskQueues.Todo), "todoLen": len(s.state.Todo),
"pendingLen": len(s.taskQueues.Pending), "pendingLen": len(s.state.Pending),
"doneLen": len(s.taskQueues.Done), "doneLen": len(s.state.Done),
"failedLen": len(s.taskQueues.Failed), "failedLen": len(s.state.Failed),
"curPass": s.state.CurPass,
} }
} }
...@@ -366,17 +372,17 @@ func (s *Service) GetTask(passID int, task *Task) error { ...@@ -366,17 +372,17 @@ func (s *Service) GetTask(passID int, task *Task) error {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
if passID < s.currPass { if passID < s.state.CurPass {
return ErrPassBefore return ErrPassBefore
} }
if passID > s.currPass { if passID > s.state.CurPass {
// Client may get run to pass after master when one client faster than the // Client may get run to pass after master when one client faster than the
// other // other
return ErrPassAfter return ErrPassAfter
} }
if len(s.taskQueues.Todo) == 0 { if len(s.state.Todo) == 0 {
if len(s.taskQueues.Done) == 0 && len(s.taskQueues.Pending) == 0 { if len(s.state.Done) == 0 && len(s.state.Pending) == 0 {
log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass") log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass")
return ErrAllTaskFailed return ErrAllTaskFailed
} }
...@@ -384,10 +390,10 @@ func (s *Service) GetTask(passID int, task *Task) error { ...@@ -384,10 +390,10 @@ func (s *Service) GetTask(passID int, task *Task) error {
return ErrNoMoreAvailable return ErrNoMoreAvailable
} }
t := s.taskQueues.Todo[0] t := s.state.Todo[0]
t.Task.Meta.Epoch++ t.Task.Meta.Epoch++
s.taskQueues.Todo = s.taskQueues.Todo[1:] s.state.Todo = s.state.Todo[1:]
s.taskQueues.Pending[t.Task.Meta.ID] = t s.state.Pending[t.Task.Meta.ID] = t
err := s.snapshot() err := s.snapshot()
if err != nil { if err != nil {
return err return err
...@@ -409,7 +415,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { ...@@ -409,7 +415,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[taskID] t, ok := s.state.Pending[taskID]
if !ok { if !ok {
log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID) log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
return nil return nil
...@@ -417,18 +423,18 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { ...@@ -417,18 +423,18 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
// task finished, reset timeout // task finished, reset timeout
t.NumFailure = 0 t.NumFailure = 0
s.taskQueues.Done = append(s.taskQueues.Done, t) s.state.Done = append(s.state.Done, t)
delete(s.taskQueues.Pending, taskID) delete(s.state.Pending, taskID)
log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID) log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
if len(s.taskQueues.Todo) == 0 && len(s.taskQueues.Pending) == 0 { if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 {
// increase master side pass count if all tasks finished // increase master side pass count if all tasks finished
s.currPass++ s.state.CurPass++
s.taskQueues.Todo = s.jobTasks s.state.Todo = append(s.state.Done, s.state.Failed...)
s.taskQueues.Done = []taskEntry{} s.state.Done = []taskEntry{}
// TODO(typhoonzero): deal with failed tasks // TODO(typhoonzero): deal with failed tasks
s.taskQueues.Failed = []taskEntry{} s.state.Failed = []taskEntry{}
log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.currPass) log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.state.CurPass)
} }
err := s.snapshot() err := s.snapshot()
...@@ -447,7 +453,7 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { ...@@ -447,7 +453,7 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[meta.ID] t, ok := s.state.Pending[meta.ID]
if !ok { if !ok {
log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta) log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta)
return nil return nil
......
package master_test package master_test
import ( import (
"io/ioutil"
"net/url"
"os" "os"
"strings"
"testing" "testing"
"time" "time"
"github.com/PaddlePaddle/Paddle/go/master" "github.com/PaddlePaddle/Paddle/go/master"
"github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/embed" "github.com/coreos/etcd/embed"
"github.com/docker/docker/pkg/ioutils"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
) )
func TestNewServiceWithEtcd(t *testing.T) { func TestNewServiceWithEtcd(t *testing.T) {
// setup an embed etcd server // setup an embed etcd server
etcdDir, err := ioutils.TempDir("", "") etcdDir, err := ioutil.TempDir("", "")
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
cfg := embed.NewConfig() cfg := embed.NewConfig()
lpurl, _ := url.Parse("http://localhost:0")
lcurl, _ := url.Parse("http://localhost:0")
cfg.LPUrls = []url.URL{*lpurl}
cfg.LCUrls = []url.URL{*lcurl}
cfg.Dir = etcdDir cfg.Dir = etcdDir
e, err := embed.StartEtcd(cfg) e, err := embed.StartEtcd(cfg)
if err != nil { if err != nil {
...@@ -30,15 +36,13 @@ func TestNewServiceWithEtcd(t *testing.T) { ...@@ -30,15 +36,13 @@ func TestNewServiceWithEtcd(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
}() }()
select {
case <-e.Server.ReadyNotify():
t.Log("Server is ready!")
case <-time.After(60 * time.Second):
e.Server.Stop() // trigger a shutdown
t.Fatal("Server took too long to start!")
}
ep := []string{"127.0.0.1:2379"} <-e.Server.ReadyNotify()
port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
endpoint := "127.0.0.1:" + port
ep := []string{endpoint}
masterAddr := "127.0.0.1:3306" masterAddr := "127.0.0.1:3306"
store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30) store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
if err != nil { if err != nil {
......
...@@ -90,8 +90,12 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte { ...@@ -90,8 +90,12 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
type selector bool type selector bool
func (s selector) Select() bool { func (s selector) Select() (bool, error) {
return bool(s) return bool(s), nil
}
func (s selector) Done() error {
return nil
} }
type lister []client.Server type lister []client.Server
...@@ -114,11 +118,10 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli ...@@ -114,11 +118,10 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
} }
//export paddle_new_etcd_pserver_client //export paddle_new_etcd_pserver_client
func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client { func paddle_new_etcd_pserver_client(etcdEndpoints *C.char) C.paddle_pserver_client {
// TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters)
addr := C.GoString(etcdEndpoints) addr := C.GoString(etcdEndpoints)
etcdClient := client.NewEtcd(addr) etcdClient := client.NewEtcd(addr)
c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0)) c := client.NewClient(etcdClient, etcdClient.Desired(), etcdClient)
return add(c) return add(c)
} }
...@@ -136,7 +139,12 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) { ...@@ -136,7 +139,12 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) {
//export paddle_begin_init_params //export paddle_begin_init_params
func paddle_begin_init_params(client C.paddle_pserver_client) C.int { func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
c := get(client) c := get(client)
if selected := c.BeginInitParams(); selected { selected, err := c.BeginInitParams()
if err != nil {
panic(err)
}
if selected {
return 1 return 1
} }
return 0 return 0
......
...@@ -17,12 +17,10 @@ def main(): ...@@ -17,12 +17,10 @@ def main():
# network config # network config
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, y_predict = paddle.layer.fc(input=x,
param_attr=paddle.attr.Param( param_attr=paddle.attr.Param(name='w'),
name='w', learning_rate=1e-3),
size=1, size=1,
act=paddle.activation.Linear(), act=paddle.activation.Linear(),
bias_attr=paddle.attr.Param( bias_attr=paddle.attr.Param(name='b'))
name='b', learning_rate=1e-3))
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y) cost = paddle.layer.mse_cost(input=y_predict, label=y)
......
...@@ -27,9 +27,13 @@ import ( ...@@ -27,9 +27,13 @@ import (
// TODO(helin): add RPC call retry logic // TODO(helin): add RPC call retry logic
// Selector selects if the client should initialize parameter servers. // Selector selects if the client should initialize parameters and
// reports the initialization process done.
type Selector interface { type Selector interface {
Select() bool // Select selects if the client should initialize parameter servers.
Select() (bool, error)
// Done indicates the initialization process is done.
Done() error
} }
// Server is the identification of a parameter Server. // Server is the identification of a parameter Server.
...@@ -115,7 +119,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) { ...@@ -115,7 +119,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
// servers. Other trainers will be blocked until the initialization is // servers. Other trainers will be blocked until the initialization is
// done, and they need to get the initialized parameters from // done, and they need to get the initialized parameters from
// parameter servers using GetParams. // parameter servers using GetParams.
func (c *Client) BeginInitParams() bool { func (c *Client) BeginInitParams() (bool, error) {
return c.sel.Select() return c.sel.Select()
} }
......
...@@ -59,7 +59,7 @@ func initClient() [numPserver]int { ...@@ -59,7 +59,7 @@ func initClient() [numPserver]int {
go func(l net.Listener) { go func(l net.Listener) {
var cp pserver.Checkpoint var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp) s, err := pserver.NewService(0, time.Hour, "", nil, cp)
if err != nil { if err != nil {
panic(err) panic(err)
} }
...@@ -124,8 +124,12 @@ func initEtcdClient() { ...@@ -124,8 +124,12 @@ func initEtcdClient() {
type selector bool type selector bool
func (s selector) Select() bool { func (s selector) Select() (bool, error) {
return bool(s) return bool(s), nil
}
func (s selector) Done() error {
return nil
} }
type lister []client.Server type lister []client.Server
...@@ -135,7 +139,11 @@ func (l lister) List() []client.Server { ...@@ -135,7 +139,11 @@ func (l lister) List() []client.Server {
} }
func testClient(t *testing.T, c *client.Client) { func testClient(t *testing.T, c *client.Client) {
selected := c.BeginInitParams() selected, err := c.BeginInitParams()
if err != nil {
t.Fatal(err)
}
if !selected { if !selected {
t.Fatal("should be selected.") t.Fatal("should be selected.")
} }
......
...@@ -16,53 +16,60 @@ package client ...@@ -16,53 +16,60 @@ package client
import ( import (
"context" "context"
"errors"
"fmt"
"strconv" "strconv"
"strings" "strings"
"time" "time"
"github.com/PaddlePaddle/Paddle/go/pserver" "github.com/PaddlePaddle/Paddle/go/pserver"
"github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/clientv3/concurrency"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
const ( const (
defaultEtcdTimeout time.Duration = 5 * time.Second defaultEtcdTimeout time.Duration = 5 * time.Second
initLockPath = "/init_ps/lock"
initDonePath = "/init_ps/done"
initDoneVal = "1"
) )
// EtcdClient is used by pserver client that is a part of trainer process. // Etcd is used by pserver client that is a part of trainer process.
// TODO: // TODO:
// 1. add watcher to watch the change state of pservers) // 1. add watcher to watch the change state of pservers.
// 1. add etcd lock) type Etcd struct {
type EtcdClient struct {
client *clientv3.Client client *clientv3.Client
timeout time.Duration timeout time.Duration
endpoints []string endpoints []string
lock *concurrency.Mutex
} }
// Desired read ps desired number from etcd. // Desired read ps desired number from etcd.
func (p *EtcdClient) Desired() int { func (e *Etcd) Desired() int {
var psDesired int var psDesired int
for { for {
ctx, cancel := context.WithTimeout(context.Background(), p.timeout) ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
resp, err := p.client.Get(ctx, pserver.PsDesired) resp, err := e.client.Get(ctx, pserver.PsDesired)
cancel() cancel()
if err != nil { if err != nil {
log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err) log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err)
time.Sleep(p.timeout) time.Sleep(e.timeout)
continue continue
} }
kvs := resp.Kvs kvs := resp.Kvs
if len(kvs) == 0 { if len(kvs) == 0 {
log.Infoln("Waiting for ps desired registered ...") log.Infoln("Waiting for ps desired registered ...")
time.Sleep(p.timeout) time.Sleep(e.timeout)
continue continue
} }
psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value)) psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
if err != nil { if err != nil {
log.Errorf("psDesired %d invalid %v", psDesired, err) log.Errorf("psDesired %d invalid %v", psDesired, err)
time.Sleep(p.timeout) time.Sleep(e.timeout)
continue continue
} }
...@@ -73,26 +80,26 @@ func (p *EtcdClient) Desired() int { ...@@ -73,26 +80,26 @@ func (p *EtcdClient) Desired() int {
} }
// List return the pserver list read from etcd. // List return the pserver list read from etcd.
func (p *EtcdClient) List() []Server { func (e *Etcd) List() []Server {
psDesired := p.Desired() psDesired := e.Desired()
servers := make([]Server, psDesired) servers := make([]Server, psDesired)
for { for {
for i := 0; i < psDesired; i++ { for i := 0; i < psDesired; i++ {
ctx, cancel := context.WithTimeout(context.Background(), p.timeout) ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
psKey := pserver.PsPath + strconv.Itoa(i) psKey := pserver.PsPath + strconv.Itoa(i)
log.Debugf("checking %s", psKey) log.Debugf("checking %s", psKey)
resp, err := p.client.Get(ctx, psKey) resp, err := e.client.Get(ctx, psKey)
cancel() cancel()
if err != nil { if err != nil {
log.Infof("Get psKey= %s error, %v", psKey, err) log.Infof("Get psKey= %s error, %v", psKey, err)
time.Sleep(p.timeout) time.Sleep(e.timeout)
continue continue
} }
kvs := resp.Kvs kvs := resp.Kvs
if len(kvs) == 0 { if len(kvs) == 0 {
log.Infof("Waiting for ps addr registered ...") log.Infof("Waiting for ps addr registered ...")
time.Sleep(p.timeout) time.Sleep(e.timeout)
continue continue
} }
...@@ -100,10 +107,10 @@ func (p *EtcdClient) List() []Server { ...@@ -100,10 +107,10 @@ func (p *EtcdClient) List() []Server {
// TODO(Longfei) check the ps address // TODO(Longfei) check the ps address
if psAddr == "" { if psAddr == "" {
log.Infof("Get psKey = %s, psAddr is empty", psKey) log.Infof("Get psKey = %s, psAddr is empty", psKey)
time.Sleep(p.timeout) time.Sleep(e.timeout)
continue continue
} }
log.Infof("got value (%s) for key: %s", psAddr, psKey) log.Debugf("got value (%s) for key: %s", psAddr, psKey)
servers[i].Index = i servers[i].Index = i
servers[i].Addr = psAddr servers[i].Addr = psAddr
} }
...@@ -113,7 +120,7 @@ func (p *EtcdClient) List() []Server { ...@@ -113,7 +120,7 @@ func (p *EtcdClient) List() []Server {
} }
// NewEtcd create a etcd client to return the state of pserver on etcd. // NewEtcd create a etcd client to return the state of pserver on etcd.
func NewEtcd(endpoints string) *EtcdClient { func NewEtcd(endpoints string) *Etcd {
ep := strings.Split(endpoints, ",") ep := strings.Split(endpoints, ",")
var cli *clientv3.Client var cli *clientv3.Client
var err error var err error
...@@ -130,10 +137,118 @@ func NewEtcd(endpoints string) *EtcdClient { ...@@ -130,10 +137,118 @@ func NewEtcd(endpoints string) *EtcdClient {
break break
} }
log.Infof("Connected to etcd: %s\n", endpoints) log.Infof("Connected to etcd: %s\n", endpoints)
client := &EtcdClient{ client := &Etcd{
client: cli, client: cli,
timeout: defaultEtcdTimeout, timeout: defaultEtcdTimeout,
endpoints: ep, endpoints: ep,
} }
return client return client
} }
// Select reports whether the current trainer is selected to initialize
// the pserver parameters.
//
// It blocks until the init lock at initLockPath is acquired, then reads
// initDonePath inside a transaction guarded by lock ownership:
//   - key absent: this trainer is selected; the lock is stored in e.lock
//     (to be released by Done or Close) and (true, nil) is returned.
//   - key equals initDoneVal: initialization has already been done; the
//     lock is released and (false, nil) is returned.
//   - transaction failure, lost ownership, or an unexpected value: the
//     lock is released on a best-effort basis and an error is returned.
func (e *Etcd) Select() (bool, error) {
	sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
	if err != nil {
		return false, err
	}

	lock := concurrency.NewMutex(sess, initLockPath)
	log.Infof("Trying to acquire lock at %s.", initLockPath)
	// Do not use timeout context here, since we don't know how
	// long does it take for other trainers to initialize the
	// parameters.
	err = lock.Lock(context.Background())
	if err != nil {
		return false, err
	}
	log.Infof("Successfully acquired lock at %s.", initLockPath)

	// release hands the lock back when this trainer is not going to
	// perform the initialization. Without it, every non-selected exit
	// below would leave the lock held by this process while the other
	// trainers wait in Lock, which has no timeout. Unlock failures are
	// only logged so that they never mask the primary result.
	release := func() {
		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
		defer cancel()
		if uErr := lock.Unlock(ctx); uErr != nil {
			log.Errorln(uErr)
		}
	}

	get := clientv3.OpGet(initDonePath)
	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
	tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
	cancel()
	if err != nil {
		release()
		return false, err
	}

	if !tresp.Succeeded {
		release()
		return false, errors.New("no longer the owner of the lock")
	}

	resp := tresp.Responses[0].GetResponseRange()
	if len(resp.Kvs) == 0 {
		// Key value not set, select current trainer. The lock is
		// intentionally kept; Done (or Close) releases it.
		e.lock = lock
		log.Infoln("Trainer selected.")
		return true, nil
	}

	if string(resp.Kvs[0].Value) == initDoneVal {
		log.Infoln("Initialization is already done.")
		release()
		return false, nil
	}

	release()
	// The value is a []byte; print it as a quoted string rather than
	// as a list of byte values.
	return false, fmt.Errorf("key %s have unexpected value: %q", initDonePath, string(resp.Kvs[0].Value))
}
// Done records that parameter initialization has finished and then
// releases the init lock held by this trainer.
//
// It returns an error if no lock is held, if the transaction that writes
// initDoneVal to initDonePath fails, or if lock ownership was lost before
// the write. A failure to unlock afterwards is only logged; e.lock is
// kept in that case so that Close can retry the unlock.
func (e *Etcd) Done() error {
	if e.lock == nil {
		return errors.New("lock is nil, Done called unexpectedly")
	}

	// Write the done marker only while this trainer still owns the lock.
	txnCtx, txnCancel := context.WithTimeout(context.Background(), e.timeout)
	markDone := clientv3.OpPut(initDonePath, initDoneVal)
	result, err := e.client.Txn(txnCtx).If(e.lock.IsOwner()).Then(markDone).Commit()
	txnCancel()
	switch {
	case err != nil:
		return err
	case !result.Succeeded:
		return errors.New("no longer the owner of the lock")
	}

	unlockCtx, unlockCancel := context.WithTimeout(context.Background(), e.timeout)
	unlockErr := e.lock.Unlock(unlockCtx)
	unlockCancel()
	if unlockErr == nil {
		e.lock = nil
		return nil
	}

	// The marker is already written, so initialization counts as done
	// even though the unlock did not go through.
	log.Errorln(unlockErr)
	return nil
}
// Close releases the init lock if one is still held and then closes the
// underlying etcd client.
//
// When both the unlock and the client close fail, the close error is
// logged and the unlock error is returned. Otherwise whichever error
// occurred (if any) is returned.
func (e *Etcd) Close() error {
	var unlockErr error
	if e.lock != nil {
		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
		unlockErr = e.lock.Unlock(ctx)
		cancel()
		if unlockErr == nil {
			e.lock = nil
		}
	}

	closeErr := e.client.Close()
	switch {
	case closeErr == nil:
		return unlockErr
	case unlockErr == nil:
		return closeErr
	default:
		// Only one error can be returned; keep the close error in the log.
		log.Errorln(closeErr)
		return unlockErr
	}
}
package client_test
import (
"io/ioutil"
"net/url"
"os"
"strings"
"sync"
"testing"
"github.com/PaddlePaddle/Paddle/go/pserver/client"
"github.com/coreos/etcd/embed"
)
// TestSelector starts an embedded etcd server on an ephemeral port and
// checks that, out of several clients racing through Select/Done, exactly
// one is selected, and that a client arriving after initialization is
// done (simulating a crashed and restarted trainer) is not selected.
func TestSelector(t *testing.T) {
	etcdDir, err := ioutil.TempDir("", "")
	if err != nil {
		t.Fatal(err)
	}

	cfg := embed.NewConfig()
	// Port 0 lets the OS pick free ports so the test does not collide
	// with other etcd instances.
	lpurl, _ := url.Parse("http://localhost:0")
	lcurl, _ := url.Parse("http://localhost:0")
	cfg.LPUrls = []url.URL{*lpurl}
	cfg.LCUrls = []url.URL{*lcurl}
	cfg.Dir = etcdDir
	e, err := embed.StartEtcd(cfg)
	if err != nil {
		t.Fatal(err)
	}

	defer func() {
		e.Close()
		if err := os.RemoveAll(etcdDir); err != nil {
			t.Fatal(err)
		}
	}()

	<-e.Server.ReadyNotify()

	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
	endpoint := "127.0.0.1:" + port

	var mu sync.Mutex
	selectedCount := 0
	var wg sync.WaitGroup
	// selectAndDone runs on its own goroutine, so it must report failures
	// with t.Error and return: t.Fatal may only be called from the
	// goroutine running the test function, and a panic here would abort
	// the whole test binary.
	selectAndDone := func(c *client.Etcd) {
		defer wg.Done()

		selected, err := c.Select()
		if err != nil {
			t.Error(err)
			return
		}

		if selected {
			mu.Lock()
			selectedCount++
			mu.Unlock()
			err = c.Done()
			if err != nil {
				t.Error(err)
			}
		}
	}

	c0 := client.NewEtcd(endpoint)
	c1 := client.NewEtcd(endpoint)
	c2 := client.NewEtcd(endpoint)
	c3 := client.NewEtcd(endpoint)
	wg.Add(3)
	go selectAndDone(c0)
	go selectAndDone(c1)
	go selectAndDone(c2)
	wg.Wait()

	// simulate trainer crashed and restarted after the
	// initialization process.
	wg.Add(1)
	go selectAndDone(c3)
	wg.Wait()

	mu.Lock()
	count := selectedCount
	mu.Unlock()
	if count != 1 {
		t.Fatal("selected count wrong:", count)
	}

	err = c0.Close()
	if err != nil {
		t.Fatal(err)
	}

	err = c1.Close()
	if err != nil {
		t.Fatal(err)
	}

	err = c2.Close()
	if err != nil {
		t.Fatal(err)
	}

	err = c3.Close()
	if err != nil {
		t.Fatal(err)
	}
}
...@@ -206,6 +206,7 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { ...@@ -206,6 +206,7 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
if err != nil { if err != nil {
return []byte{}, err return []byte{}, err
} }
kvs := resp.Kvs kvs := resp.Kvs
if len(kvs) == 0 { if len(kvs) == 0 {
return []byte{}, nil return []byte{}, nil
...@@ -215,9 +216,14 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { ...@@ -215,9 +216,14 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
} }
// PutKey put into etcd with value by key specified // PutKey put into etcd with value by key specified
func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
ctx, cancel := context.WithTimeout(context.Background(), timeout) ctx, cancel := context.WithTimeout(context.Background(), timeout)
_, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease())) var err error
if withLease {
_, err = e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease()))
} else {
_, err = e.client.Put(ctx, key, string(value))
}
cancel() cancel()
return err return err
} }
......
...@@ -32,6 +32,7 @@ type optimizer struct { ...@@ -32,6 +32,7 @@ type optimizer struct {
opt *C.struct_paddle_optimizer opt *C.struct_paddle_optimizer
elementType ElementType elementType ElementType
contentLen int contentLen int
config []byte
} }
func cArrayToSlice(p unsafe.Pointer, len int) []byte { func cArrayToSlice(p unsafe.Pointer, len int) []byte {
...@@ -70,6 +71,7 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer ...@@ -70,6 +71,7 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer
cstate = unsafe.Pointer(&s[0]) cstate = unsafe.Pointer(&s[0])
} }
o.config = c
o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)), o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s))) C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s)))
return o return o
......
...@@ -25,11 +25,13 @@ import ( ...@@ -25,11 +25,13 @@ import (
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path"
"strconv" "strconv"
"sync" "sync"
"time" "time"
uuid "github.com/satori/go.uuid"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
...@@ -44,7 +46,7 @@ var ErrCheckpointNotFound = errors.New("checkpoint not found") ...@@ -44,7 +46,7 @@ var ErrCheckpointNotFound = errors.New("checkpoint not found")
const ( const (
AlreadyInitialized = "pserver already initialized" AlreadyInitialized = "pserver already initialized"
Uninitialized = "pserver not fully initialized" Uninitialized = "pserver not fully initialized"
CheckpointMD5Failed = "checkpoint file MD5 validation failed" WrongChecksum = "checkpoint file checksum validation failed"
) )
// Supported element types. // Supported element types.
...@@ -73,11 +75,12 @@ type ParameterWithConfig struct { ...@@ -73,11 +75,12 @@ type ParameterWithConfig struct {
// checkpointMeta saves checkpoint metadata // checkpointMeta saves checkpoint metadata
type checkpointMeta struct { type checkpointMeta struct {
UUID string `json:"uuid"` UUID string `json:"uuid"`
Path string `json:"path"`
MD5 string `json:"md5"` MD5 string `json:"md5"`
Timestamp int64 `json:"timestamp"` Timestamp int64 `json:"timestamp"`
} }
// Checkpoint is the pserver shard persist in file // Checkpoint is the pserver shard persist in file.
type Checkpoint []parameterCheckpoint type Checkpoint []parameterCheckpoint
// Gradient is the gradient of the parameter. // Gradient is the gradient of the parameter.
...@@ -90,50 +93,58 @@ type Service struct { ...@@ -90,50 +93,58 @@ type Service struct {
checkpointInterval time.Duration checkpointInterval time.Duration
checkpointPath string checkpointPath string
client *EtcdClient client *EtcdClient
mu sync.Mutex mu sync.Mutex
optMap map[string]*optimizer optMap map[string]*optimizer
} }
// parameterCheckpoint saves parameter checkpoint // parameterCheckpoint saves parameter checkpoint.
type parameterCheckpoint struct { type parameterCheckpoint struct {
ParameterWithConfig ParameterWithConfig
State []byte State []byte
} }
// NewCheckpointFromFile loads parameters and state from checkpoint file func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) {
func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, error) { v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second)
v, err := e.GetKey(PsPath+string(idx), 3*time.Second)
if err != nil { if err != nil {
return nil, err return
} }
if len(v) == 0 { if len(v) == 0 {
return nil, ErrCheckpointNotFound err = ErrCheckpointNotFound
return
} }
var cpMeta checkpointMeta if err = json.Unmarshal(v, &meta); err != nil {
if err = json.Unmarshal(v, &cpMeta); err != nil { return
return nil, err
} }
fn := filepath.Join(cpPath, cpMeta.UUID) return
if _, err = os.Stat(fn); os.IsNotExist(err) { }
// LoadCheckpoint loads checkpoint from file.
func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) {
cpMeta, err := loadMeta(e, idx)
if err != nil {
return nil, err return nil, err
} }
content, err := ioutil.ReadFile(fn)
content, err := ioutil.ReadFile(cpMeta.Path)
if err != nil { if err != nil {
return nil, err return nil, err
} }
// TODO(helin): change MD5 to CRC since CRC is better for file
// checksum in our use case (emphasize speed over security).
h := md5.New() h := md5.New()
md5 := hex.EncodeToString(h.Sum(content)) md5 := hex.EncodeToString(h.Sum(content))
if md5 != cpMeta.MD5 { if md5 != cpMeta.MD5 {
return nil, errors.New(CheckpointMD5Failed) return nil, errors.New(WrongChecksum)
} }
dec := gob.NewDecoder(bytes.NewReader(content)) dec := gob.NewDecoder(bytes.NewReader(content))
cp := Checkpoint{} var cp Checkpoint
if err = dec.Decode(cp); err != nil { if err = dec.Decode(&cp); err != nil {
return nil, err return nil, err
} }
return cp, nil return cp, nil
...@@ -193,6 +204,15 @@ func (s *Service) FinishInitParams(_ int, _ *int) error { ...@@ -193,6 +204,15 @@ func (s *Service) FinishInitParams(_ int, _ *int) error {
} }
close(s.initialized) close(s.initialized)
go func() {
t := time.Tick(s.checkpointInterval)
for range t {
err := s.checkpoint()
if err != nil {
log.Errorln(err)
}
}
}()
return nil return nil
} }
...@@ -240,23 +260,36 @@ func (s *Service) GetParam(name string, parameter *Parameter) error { ...@@ -240,23 +260,36 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
return nil return nil
} }
// pserver save checkpoint func traceTime(start time.Time, name string) {
func (s *Service) doCheckpoint() (err error) { elapsed := time.Since(start)
<-s.initialized log.Infof("%s took %v", name, elapsed)
s.mu.Lock() }
defer s.mu.Unlock()
// checkpoint saves checkpoint to disk.
//
// checkpoint should be only called after the parameters are
// initialized.
func (s *Service) checkpoint() (err error) {
log.Infoln("Begin save checkpoint.")
defer traceTime(time.Now(), "save checkpoint")
s.mu.Lock()
cp := make([]parameterCheckpoint, len(s.optMap)) cp := make([]parameterCheckpoint, len(s.optMap))
index := 0 index := 0
// TODO(helin): write checkpoint incrementally to reduce memory
// footprint during checkpoint.
for name, opt := range s.optMap { for name, opt := range s.optMap {
var pc parameterCheckpoint var pc parameterCheckpoint
pc.Param.Name = name pc.Param.Name = name
pc.Param.ElementType = opt.elementType pc.Param.ElementType = opt.elementType
pc.Param.Content = opt.GetWeights() pc.Param.Content = opt.GetWeights()
pc.Config = opt.config
pc.State = opt.GetStates() pc.State = opt.GetStates()
cp[index] = pc cp[index] = pc
index++ index++
} }
s.mu.Unlock()
var buf bytes.Buffer var buf bytes.Buffer
encoder := gob.NewEncoder(&buf) encoder := gob.NewEncoder(&buf)
err = encoder.Encode(cp) err = encoder.Encode(cp)
...@@ -264,32 +297,9 @@ func (s *Service) doCheckpoint() (err error) { ...@@ -264,32 +297,9 @@ func (s *Service) doCheckpoint() (err error) {
return return
} }
cpMeta := checkpointMeta{} id := uuid.NewV4().String()
cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx) p := path.Join(s.checkpointPath, id)
cpMeta.Timestamp = time.Now().UnixNano() f, err := os.Create(p)
h := md5.New()
cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))
cpMetajson, err := json.Marshal(cpMeta)
if err != nil {
return
}
err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
if err != nil {
return
}
if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
log.Info("checkpoint does not exists.")
} else {
err = os.Remove(cpMeta.UUID)
if err != nil {
log.Infof("Removing checkpoint %s failed", cpMeta.UUID)
} else {
log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
}
}
f, err := os.Create(cpMeta.UUID)
if err != nil { if err != nil {
return return
} }
...@@ -317,5 +327,43 @@ func (s *Service) doCheckpoint() (err error) { ...@@ -317,5 +327,43 @@ func (s *Service) doCheckpoint() (err error) {
return return
} }
oldMeta, err := loadMeta(s.client, s.idx)
if err == ErrCheckpointNotFound {
log.Infoln("Do not have existing checkpoint.")
err = nil
}
if err != nil {
return
}
h := md5.New()
md5 := hex.EncodeToString(h.Sum(buf.Bytes()))
cpMeta := checkpointMeta{
UUID: id,
Timestamp: time.Now().UnixNano(),
MD5: md5,
Path: p,
}
json, err := json.Marshal(cpMeta)
if err != nil {
return
}
err = s.client.PutKey(PsCheckpoint+strconv.Itoa(s.idx), json, 3*time.Second, false)
if err != nil {
return
}
if oldMeta.Path != "" {
rmErr := os.Remove(oldMeta.Path)
if rmErr != nil {
// log error, but still treat checkpoint as
// successful.
log.Errorln(rmErr)
}
}
return return
} }
...@@ -30,7 +30,7 @@ const ( ...@@ -30,7 +30,7 @@ const (
func TestServiceFull(t *testing.T) { func TestServiceFull(t *testing.T) {
var cp pserver.Checkpoint var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp) s, err := pserver.NewService(0, time.Hour, "", nil, cp)
if err != nil { if err != nil {
t.Error(err) t.Error(err)
} }
...@@ -102,7 +102,7 @@ func TestServiceFull(t *testing.T) { ...@@ -102,7 +102,7 @@ func TestServiceFull(t *testing.T) {
func TestMultipleInit(t *testing.T) { func TestMultipleInit(t *testing.T) {
var cp pserver.Checkpoint var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp) s, err := pserver.NewService(0, time.Hour, "", nil, cp)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
...@@ -119,7 +119,7 @@ func TestMultipleInit(t *testing.T) { ...@@ -119,7 +119,7 @@ func TestMultipleInit(t *testing.T) {
func TestUninitialized(t *testing.T) { func TestUninitialized(t *testing.T) {
var cp pserver.Checkpoint var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp) s, err := pserver.NewService(0, time.Hour, "", nil, cp)
err = s.SendGrad(pserver.Gradient{}, nil) err = s.SendGrad(pserver.Gradient{}, nil)
if err.Error() != pserver.Uninitialized { if err.Error() != pserver.Uninitialized {
t.Fatal(err) t.Fatal(err)
...@@ -128,7 +128,7 @@ func TestUninitialized(t *testing.T) { ...@@ -128,7 +128,7 @@ func TestUninitialized(t *testing.T) {
func TestBlockUntilInitialized(t *testing.T) { func TestBlockUntilInitialized(t *testing.T) {
var cp pserver.Checkpoint var cp pserver.Checkpoint
s, err := pserver.NewService(0, 1, "", nil, cp) s, err := pserver.NewService(0, time.Hour, "", nil, cp)
if err != nil { if err != nil {
t.Error(err) t.Error(err)
} }
......
...@@ -21,22 +21,15 @@ ...@@ -21,22 +21,15 @@
# #
# It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
# #
PYPATH=""
if ! python -c "import paddle" >/dev/null 2>/dev/null; then set -x
PYPATH="" while getopts "d:" opt; do
set -x
while getopts "d:" opt; do
case $opt in case $opt in
d) d)
PYPATH=$OPTARG PYPATH=$OPTARG
;; ;;
esac esac
done done
shift $(($OPTIND - 1)) shift $(($OPTIND - 1))
export PYTHONPATH=$PYPATH:$PYTHONPATH export PYTHONPATH=$PYPATH:$PYTHONPATH
$@ $@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
exit 1
fi
...@@ -22,7 +22,5 @@ if(WITH_C_API) ...@@ -22,7 +22,5 @@ if(WITH_C_API)
endif() endif()
if(WITH_SWIG_PY) if(WITH_SWIG_PY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
add_subdirectory(api) add_subdirectory(api)
endif() endif()
...@@ -19,9 +19,9 @@ add_library(paddle_api STATIC ${API_SOURCES}) ...@@ -19,9 +19,9 @@ add_library(paddle_api STATIC ${API_SOURCES})
add_dependencies(paddle_api paddle_proto paddle_trainer_lib) add_dependencies(paddle_api paddle_proto paddle_trainer_lib)
INCLUDE(${SWIG_USE_FILE}) INCLUDE(${SWIG_USE_FILE})
INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle) INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle)
FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
...@@ -79,22 +79,16 @@ SWIG_LINK_LIBRARIES(swig_paddle ...@@ -79,22 +79,16 @@ SWIG_LINK_LIBRARIES(swig_paddle
${START_END} ${START_END}
) )
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch .timestamp
COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
COMMAND rm -rf py_paddle.egg-info build
WORKING_DIRECTORY ${PROJ_ROOT}/paddle
DEPENDS _swig_paddle DEPENDS _swig_paddle
) )
# TODO(yuyang18) : make wheel name calculated by cmake # TODO(yuyang18) : make wheel name calculated by cmake
add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so) add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
install(DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/dist/
DESTINATION opt/paddle/share/wheels
)
if(WITH_TESTING) if(WITH_TESTING)
IF(NOT PY_PIP_FOUND) IF(NOT PY_PIP_FOUND)
...@@ -108,7 +102,7 @@ if(WITH_TESTING) ...@@ -108,7 +102,7 @@ if(WITH_TESTING)
BUILD_COMMAND "" BUILD_COMMAND ""
INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
BUILD_IN_SOURCE 1 BUILD_IN_SOURCE 1
DEPENDS python setuptools python_api_wheel #DEPENDS python setuptools python_api_wheel
) )
ENDIF() ENDIF()
add_subdirectory(test) add_subdirectory(test)
......
...@@ -41,7 +41,7 @@ ParameterUpdater *ParameterUpdater::createNewRemoteUpdater( ...@@ -41,7 +41,7 @@ ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
config->m->getConfig(), pserverSpec, useEtcd)); config->m->getConfig(), pserverSpec, useEtcd));
return updater; return updater;
#else #else
throw UnsupportError(); throw UnsupportError("not compiled with WITH_GOLANG");
#endif #endif
} }
......
add_python_test(test_swig_api py_test(testTrain SRCS testTrain.py)
testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py) py_test(testMatrix SRCS testMatrix.py)
py_test(testVector SRCS testVector.py)
py_test(testTrainer SRCS testTrainer.py)
py_test(testArguments SRCS testArguments.py)
py_test(testGradientMachine SRCS testGradientMachine.py)
...@@ -90,6 +90,18 @@ paddle_error paddle_arguments_set_ids(paddle_arguments args, ...@@ -90,6 +90,18 @@ paddle_error paddle_arguments_set_ids(paddle_arguments args,
return kPD_NO_ERROR; return kPD_NO_ERROR;
} }
// Stores the frame height and width on the argument at index `ID` of `args`.
// Returns kPD_NULLPTR for a null handle, kPD_OUT_OF_RANGE when `ID` does not
// address an existing slot, and kPD_NO_ERROR once both values are stored.
paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
                                              uint64_t ID,
                                              uint64_t frameHeight,
                                              uint64_t frameWidth) {
  // Reject a missing handle before it is cast and dereferenced.
  if (args == nullptr) {
    return kPD_NULLPTR;
  }

  auto arguments = castArg(args);
  const bool indexInRange = ID < arguments->args.size();
  if (!indexInRange) {
    return kPD_OUT_OF_RANGE;
  }

  // Both setters act on the same slot; bind it once.
  auto& slot = arguments->args[ID];
  slot.setFrameHeight(frameHeight);
  slot.setFrameWidth(frameWidth);
  return kPD_NO_ERROR;
}
paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args, paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
uint64_t ID, uint64_t ID,
uint32_t nestedLevel, uint32_t nestedLevel,
......
...@@ -111,6 +111,20 @@ PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args, ...@@ -111,6 +111,20 @@ PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
uint64_t ID, uint64_t ID,
paddle_ivector ids); paddle_ivector ids);
/**
 * @brief paddle_arguments_set_frame_shape Set the frame size (height and
 *        width) of one argument in the array, selected by index `ID`.
 * @param [in] args arguments array
 * @param [in] ID array index
 * @param [in] frameHeight maximum height of input images
 * @param [in] frameWidth maximum width of input images
 * @return paddle_error
 */
PD_API paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
                                                     uint64_t ID,
                                                     uint64_t frameHeight,
                                                     uint64_t frameWidth);
/** /**
* @brief PDArgsSetSequenceStartPos Set sequence start position vector of one * @brief PDArgsSetSequenceStartPos Set sequence start position vector of one
* argument in array, which index is `ID`. * argument in array, which index is `ID`.
......
...@@ -7,14 +7,17 @@ ...@@ -7,14 +7,17 @@
do { \ do { \
paddle_error __err__ = stmt; \ paddle_error __err__ = stmt; \
if (__err__ != kPD_NO_ERROR) { \ if (__err__ != kPD_NO_ERROR) { \
fprintf(stderr, "Invoke paddle error %d \n" #stmt, __err__); \ fprintf(stderr, "Invoke paddle error %d in " #stmt "\n", __err__); \
exit(__err__); \ exit(__err__); \
} \ } \
} while (0) } while (0)
void* read_config(const char* filename, long* size) { void* read_config(const char* filename, long* size) {
FILE* file = fopen(filename, "r"); FILE* file = fopen(filename, "r");
if (file == NULL) return NULL; if (file == NULL) {
fprintf(stderr, "Open %s error\n", filename);
return NULL;
}
fseek(file, 0L, SEEK_END); fseek(file, 0L, SEEK_END);
*size = ftell(file); *size = ftell(file);
fseek(file, 0L, SEEK_SET); fseek(file, 0L, SEEK_SET);
......
...@@ -54,6 +54,31 @@ paddle_error paddle_gradient_machine_create_for_inference( ...@@ -54,6 +54,31 @@ paddle_error paddle_gradient_machine_create_for_inference(
return kPD_NO_ERROR; return kPD_NO_ERROR;
} }
// Builds an inference-only gradient machine from a single in-memory "merged
// model" buffer.
//
// Buffer layout, as consumed below:
//   1. a native int64_t holding the byte length of the serialized config,
//   2. that many bytes of a serialized paddle::TrainerConfig,
//   3. the parameter values, loaded in the order reported by
//      GradientMachine::getParameters().
//
// machine     [out] receives the newly created machine on success.
// mergedModel [in]  start of the merged model buffer.
// size        [in]  length of the buffer in bytes.
//
// Returns kPD_NULLPTR when `machine` or `mergedModel` is null,
// kPD_PROTOBUF_ERROR when the header or config is truncated, malformed, or
// fails to parse, and kPD_NO_ERROR otherwise.
paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
  // `machine` is written through at the end, so it must be valid as well.
  if (machine == nullptr || mergedModel == nullptr) return kPD_NULLPTR;

  std::istringstream is(std::string(static_cast<char*>(mergedModel), size));

  // Read the length prefix; a buffer shorter than the prefix is malformed.
  int64_t modelConfigSize = 0;
  if (!is.read(reinterpret_cast<char*>(&modelConfigSize),
               sizeof(modelConfigSize))) {
    return kPD_PROTOBUF_ERROR;
  }

  // The length comes from caller-supplied bytes: reject negative values and
  // values larger than what remains in the buffer before using it to size an
  // allocation. The successful read above guarantees
  // size >= sizeof(modelConfigSize), so the subtraction cannot wrap.
  if (modelConfigSize < 0 ||
      static_cast<uint64_t>(modelConfigSize) >
          size - sizeof(modelConfigSize)) {
    return kPD_PROTOBUF_ERROR;
  }

  std::string modelConfigProtobuf;
  modelConfigProtobuf.resize(modelConfigSize);
  if (!is.read(&modelConfigProtobuf[0], modelConfigSize)) {
    return kPD_PROTOBUF_ERROR;
  }

  paddle::TrainerConfig config;
  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
    return kPD_PROTOBUF_ERROR;
  }

  auto ptr = new paddle::capi::CGradientMachine();
  ptr->machine.reset(paddle::GradientMachine::create(
      config.model_config(), CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));

  // The remainder of the stream holds the parameter values, one after another.
  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
  for (auto& para : parameters) {
    para->load(is);
  }

  *machine = ptr;
  return kPD_NO_ERROR;
}
paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) { paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
delete cast(machine); delete cast(machine);
return kPD_NO_ERROR; return kPD_NO_ERROR;
......
...@@ -36,6 +36,18 @@ typedef void* paddle_gradient_machine; ...@@ -36,6 +36,18 @@ typedef void* paddle_gradient_machine;
PD_API paddle_error paddle_gradient_machine_create_for_inference( PD_API paddle_error paddle_gradient_machine_create_for_inference(
paddle_gradient_machine* machine, void* modelConfigProtobuf, int size); paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);
/**
 * @brief Create a gradient machine used for model inference, using config with
 *        parameters which is generated by `paddle merge_model`.
 * @param [out] machine that used for model inference.
 * @param [in] mergedModel pointer to the in-memory merged model: a config
 *        length, the serialized config, then the parameter values.
 * @param [in] size length of the `mergedModel` buffer in bytes.
 * @return paddle_error kPD_NULLPTR if `mergedModel` is NULL,
 *         kPD_PROTOBUF_ERROR if the embedded config cannot be parsed,
 *         kPD_NO_ERROR on success.
 */
PD_API paddle_error
paddle_gradient_machine_create_for_inference_with_parameters(
    paddle_gradient_machine* machine, void* mergedModel, uint64_t size);
/** /**
* @brief Load parameter from disk. * @brief Load parameter from disk.
* @param machine Gradient Machine. * @param machine Gradient Machine.
......
...@@ -10,5 +10,5 @@ target_include_directories(capi_test_gradientMachine PUBLIC ...@@ -10,5 +10,5 @@ target_include_directories(capi_test_gradientMachine PUBLIC
${PADDLE_CAPI_INC_PATH}) ${PADDLE_CAPI_INC_PATH})
target_link_libraries(capi_test_gradientMachine paddle_capi) target_link_libraries(capi_test_gradientMachine paddle_capi)
add_test(NAME capi_test_gradientMachine add_test(NAME capi_test_gradientMachine
COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
WORKING_DIRECTORY ${PROJ_ROOT}/paddle/capi/tests) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
...@@ -39,6 +39,7 @@ set(CUDA_CU_SOURCES ...@@ -39,6 +39,7 @@ set(CUDA_CU_SOURCES
src/hl_cuda_lstm.cu src/hl_cuda_lstm.cu
src/hl_top_k.cu src/hl_top_k.cu
src/hl_batch_transpose.cu src/hl_batch_transpose.cu
src/hl_batch_norm.cu
src/hl_cuda_sequence.cu src/hl_cuda_sequence.cu
src/hl_table_apply.cu) src/hl_table_apply.cu)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_BATCH_NORM_H_
#define HL_BATCH_NORM_H_
#include "hl_base.h"
/**
 * @brief   batch norm inference.
 *
 * @param[in]   input         input data.
 * @param[out]  output        output data.
 * @param[in]   scale         batch normalization scale parameter (in original
 *                            paper scale is referred to as gamma).
 * @param[in]   bias          batch normalization bias parameter (in original
 *                            paper bias is referred to as beta).
 * @param[in]   estimatedMean
 * @param[in]   estimatedVar  The moving mean and variance
 *                            accumulated during the training phase are passed
 *                            as inputs here.
 * @param[in]   epsilon       Epsilon value used in the batch
 *                            normalization formula.
 * @param[in]   batchSize     number of samples in the batch.
 * @param[in]   channel       number of channels per sample; scale, bias,
 *                            estimatedMean and estimatedVar are indexed by
 *                            channel.
 * @param[in]   height        height of each channel plane.
 * @param[in]   width         width of each channel plane.
 */
extern void hl_batch_norm_cuda_inference(const real* input,
                                         real* output,
                                         const real* scale,
                                         const real* bias,
                                         const real* estimatedMean,
                                         const real* estimatedVar,
                                         const double epsilon,
                                         size_t batchSize,
                                         size_t channel,
                                         size_t height,
                                         size_t width);
#endif // HL_BATCH_NORM_H_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_batch_norm.h"
/**
 * Batch normalization inference kernel.
 *
 * One block handles one sample (blockIdx.x selects the sample); the threads of
 * the block stride over that sample's channel * height * width elements. For
 * an element belonging to channel c the kernel computes
 *
 *   output = (input - estimatedMean[c]) / sqrt(estimatedVar[c] + epsilon)
 *            * scale[c] + bias[c]
 *
 * Elements are addressed as sample * (channel * height * width) + i with
 * c = i / (height * width), i.e. samples are contiguous and channels are
 * contiguous within a sample.
 *
 * batchSize is not read by the kernel; the sample count is implied by the
 * grid size chosen at launch.
 */
__global__ void batchNormInference(real* output,
                                   const real* input,
                                   const real* scale,
                                   const real* bias,
                                   const real* estimatedMean,
                                   const real* estimatedVar,
                                   const double epsilon,
                                   size_t batchSize,
                                   size_t channel,
                                   size_t height,
                                   size_t width) {
  // Keep all index arithmetic in size_t: narrowing the element count or the
  // flat index to int overflows once the tensor reaches 2^31 elements.
  const size_t planeSize = height * width;
  const size_t num = channel * planeSize;
  const size_t sampleOffset = static_cast<size_t>(blockIdx.x) * num;

  // When planeSize is 0, num is 0 as well, so the division below is never
  // reached.
  for (size_t i = threadIdx.x; i < num; i += blockDim.x) {
    const size_t c = i / planeSize;
    const size_t id = sampleOffset + i;
    real val = input[id] - estimatedMean[c];
    val /= sqrt(estimatedVar[c] + epsilon);
    val *= scale[c];
    val += bias[c];
    output[id] = val;
  }
}
/**
 * Host-side launcher for batchNormInference.
 *
 * Launches one block of 256 threads per sample on STREAM_DEFAULT and then
 * runs CHECK_SYNC. An empty batch is a no-op.
 */
void hl_batch_norm_cuda_inference(const real* input,
                                  real* output,
                                  const real* scale,
                                  const real* bias,
                                  const real* estimatedMean,
                                  const real* estimatedVar,
                                  const double epsilon,
                                  size_t batchSize,
                                  size_t channel,
                                  size_t height,
                                  size_t width) {
  // The grid has one block per sample, and a grid with zero blocks is not a
  // valid launch configuration. With no samples there is nothing to compute.
  if (batchSize == 0) {
    return;
  }

  batchNormInference<<<batchSize, 256, 0, STREAM_DEFAULT>>>(output,
                                                            input,
                                                            scale,
                                                            bias,
                                                            estimatedMean,
                                                            estimatedVar,
                                                            epsilon,
                                                            batchSize,
                                                            channel,
                                                            height,
                                                            width);
  CHECK_SYNC("hl_batch_norm_cuda_inference failed!");
}
...@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_batch_transpose.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_batch_transpose.h"
const int TILE_DIM = 64; const int TILE_DIM = 64;
const int BLOCK_ROWS = 16; const int BLOCK_ROWS = 16;
// No bank-conflict transpose for a batch of data. // No bank-conflict transpose for a batch of data.
__global__ void batchTransposeNoBankConflicts(real* odata, __global__ void batchTransposeNoBankConflicts(
const real* idata, real* odata, const real* idata, int numSamples, int width, int height) {
int numSamples, int width,
int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1]; __shared__ float tile[TILE_DIM][TILE_DIM + 1];
const int x = blockIdx.x * TILE_DIM + threadIdx.x; const int x = blockIdx.x * TILE_DIM + threadIdx.x;
...@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata, ...@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
newX] = tile[threadIdx.x][j]; newX] = tile[threadIdx.x][j];
} }
void batchTranspose(const real* input, real* output, int width, int height, void batchTranspose(
int batchSize) { const real* input, real* output, int width, int height, int batchSize) {
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize); dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>> batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
(output, input, batchSize, width, height); output, input, batchSize, width, height);
CHECK_SYNC("batchTranspose failed!"); CHECK_SYNC("batchTranspose failed!");
} }
...@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_aggregate.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h" #include "hl_cuda.h"
#include "hl_cuda.ph" #include "hl_cuda.ph"
#include "hl_aggregate.h"
#include "hl_thread.ph"
#include "hl_matrix_base.cuh" #include "hl_matrix_base.cuh"
#include "hl_thread.ph"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
/** /**
* @brief matrix row operator. * @brief matrix row operator.
*/ */
template<class Agg, int blockSize> template <class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg, __global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
real *E,
real *Sum,
int dimN) {
__shared__ real sum_s[blockSize]; __shared__ real sum_s[blockSize];
int cnt = (dimN + blockSize -1) / blockSize; int cnt = (dimN + blockSize - 1) / blockSize;
int rowId = blockIdx.x + blockIdx.y*gridDim.x; int rowId = blockIdx.x + blockIdx.y * gridDim.x;
int index = rowId*dimN; int index = rowId * dimN;
int tid = threadIdx.x; int tid = threadIdx.x;
int lmt = tid; int lmt = tid;
...@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg, ...@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
sum_s[tid] = tmp; sum_s[tid] = tmp;
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
} }
...@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg, ...@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
} }
template <class Agg> template <class Agg>
void hl_matrix_row_op(Agg agg, void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
real *A_d,
real *C_d,
int dimM,
int dimN) {
int blocksX = dimM; int blocksX = dimM;
int blocksY = 1; int blocksY = 1;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimN); agg, A_d, C_d, dimN);
} }
void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::sum(), hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_sum failed"); CHECK_SYNC("hl_matrix_row_sum failed");
} }
...@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::max(), hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_max failed"); CHECK_SYNC("hl_matrix_row_max failed");
} }
...@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::min(), hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_row_min failed"); CHECK_SYNC("hl_matrix_row_min failed");
} }
/** /**
* @brief matrix column operator. * @brief matrix column operator.
*/ */
template<class Agg> template <class Agg>
__global__ void KeMatrixColumnOp(Agg agg, __global__ void KeMatrixColumnOp(
real *E, Agg agg, real *E, real *Sum, int dimM, int dimN) {
real *Sum,
int dimM,
int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
real tmp = agg.init(); real tmp = agg.init();
if (rowIdx < dimN) { if (rowIdx < dimN) {
...@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg, ...@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg,
} }
} }
template<class Agg, int blockDimX, int blockDimY> template <class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(Agg agg, __global__ void KeMatrixColumnOp_S(
real *E, Agg agg, real *E, real *Sum, int dimM, int dimN) {
real *Sum, __shared__ real _sum[blockDimX * blockDimY];
int dimM,
int dimN) {
__shared__ real _sum[blockDimX*blockDimY];
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int index = threadIdx.y; int index = threadIdx.y;
...@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg, ...@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
index += blockDimY; index += blockDimY;
} }
} }
_sum[threadIdx.x + threadIdx.y*blockDimX] = tmp; _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
__syncthreads(); __syncthreads();
if (rowIdx < dimN) { if (rowIdx < dimN) {
if (threadIdx.y ==0) { if (threadIdx.y == 0) {
real tmp = agg.init(); real tmp = agg.init();
for (int i=0; i < blockDimY; i++) { for (int i = 0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]); tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
} }
Sum[rowIdx] = tmp; Sum[rowIdx] = tmp;
} }
...@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg, ...@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
} }
template <class Agg> template <class Agg>
void hl_matrix_column_op(Agg agg, void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
real *A_d,
real *C_d,
int dimM,
int dimN) {
if (dimN >= 8192) { if (dimN >= 8192) {
int blocksX = (dimN + 128 -1) / 128; int blocksX = (dimN + 128 - 1) / 128;
int blocksY = 1; int blocksY = 1;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimM, dimN); agg, A_d, C_d, dimM, dimN);
} else { } else {
int blocksX = (dimN + 32 -1) / 32; int blocksX = (dimN + 32 - 1) / 32;
int blocksY = 1; int blocksY = 1;
dim3 threads(32, 32); dim3 threads(32, 32);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>> KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
(agg, A_d, C_d, dimM, dimN); agg, A_d, C_d, dimM, dimN);
} }
return; return;
...@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::sum(), hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_sum failed"); CHECK_SYNC("hl_matrix_column_sum failed");
} }
...@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::max(), hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_max failed"); CHECK_SYNC("hl_matrix_column_max failed");
} }
...@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { ...@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::min(), hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
A_d,
C_d,
dimM,
dimN);
CHECK_SYNC("hl_matrix_column_min failed"); CHECK_SYNC("hl_matrix_column_min failed");
} }
...@@ -226,16 +184,16 @@ template <int blockSize> ...@@ -226,16 +184,16 @@ template <int blockSize>
__global__ void KeVectorSum(real *E, real *Sum, int dimM) { __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize]; __shared__ double sum_s[blockSize];
int tid = threadIdx.x; int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x; int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f; sum_s[tid] = 0.0f;
while (index < dimM) { while (index < dimM) {
sum_s[tid] += E[index]; sum_s[tid] += E[index];
index += blockDim.x*gridDim.y; index += blockDim.x * gridDim.y;
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] += sum_s[tid + stride]; sum_s[tid] += sum_s[tid + stride];
} }
...@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { ...@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st; hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {} while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, t_resource.gpu_mem, dimM); A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
(t_resource.gpu_mem, t_resource.cpu_mem, 128); t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error(); cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err) CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< "CUDA error: " << hl_get_device_error_string((size_t)err); << hl_get_device_error_string((size_t)err);
} }
template <int blockSize> template <int blockSize>
__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize]; __shared__ double sum_s[blockSize];
int tid = threadIdx.x; int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x; int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f; sum_s[tid] = 0.0f;
while (index < dimM) { while (index < dimM) {
sum_s[tid] += abs(E[index]); sum_s[tid] += abs(E[index]);
index += blockDim.x*gridDim.y; index += blockDim.x * gridDim.y;
} }
__syncthreads(); __syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) { for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) { if (tid < stride) {
sum_s[tid] += sum_s[tid + stride]; sum_s[tid] += sum_s[tid + stride];
} }
...@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { ...@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st; hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {} while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, t_resource.gpu_mem, dimM); A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
(t_resource.gpu_mem, t_resource.cpu_mem, 128); t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error(); cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err) CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< "CUDA error: " << hl_get_device_error_string((size_t)err); << hl_get_device_error_string((size_t)err);
} }
此差异已折叠。
...@@ -1023,14 +1023,6 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, ...@@ -1023,14 +1023,6 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
real beta = 1.0f; real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
int batch_size = ((cudnn_tensor_descriptor)inputDesc)->batch_size;
if (batch_size > 1024 && g_cudnn_lib_version < 6000) {
LOG(INFO) << " To process current batch data with size " << batch_size
<< " (>1024), cudnnBatchNorm requires cuDNN version >= 6000."
<< " If there is an error complaining CUDNN_STATUS_NOT_SUPPORTED,"
<< " just recompile PaddlePaddle with cuDNN >= 6000, replacing"
<< " current version " << g_cudnn_lib_version;
}
CHECK_CUDNN( CHECK_CUDNN(
dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
mode, mode,
......
此差异已折叠。
...@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h" #include "hl_matrix.h"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_apply.cuh" #include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h" #include "hl_sequence.h"
#include "hl_sparse.ph" #include "hl_sparse.ph"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
void hl_matrix_add(real *A_d, void hl_matrix_add(real* A_d,
real *B_d, real* B_d,
real *C_d, real* C_d,
int dimM, int dimM,
int dimN, int dimN,
real alpha, real alpha,
...@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d, ...@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d,
CHECK_NOTNULL(B_d); CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
hl_gpu_apply_ternary_op hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
<real, ternary::_add<real>, 0, 0>(ternary::_add<real>(alpha, beta), ternary::_add<real>(alpha, beta),
A_d, A_d,
B_d, B_d,
C_d, C_d,
...@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d, ...@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d,
} }
#ifdef PADDLE_TYPE_DOUBLE #ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128 #define THRESHOLD 128
#else #else
#define THRESHOLD 64 #define THRESHOLD 64
#endif #endif
__device__ __forceinline__ __device__ __forceinline__ void findMax(real* I,
void findMax(real* I,
real* dfMax_s, real* dfMax_s,
int blockSize, int blockSize,
int base, int base,
...@@ -89,8 +87,7 @@ void findMax(real* I, ...@@ -89,8 +87,7 @@ void findMax(real* I,
__syncthreads(); __syncthreads();
} }
__device__ __forceinline__ __device__ __forceinline__ void subMaxAndExp(real* I,
void subMaxAndExp(real* I,
real* O, real* O,
int curIdx, int curIdx,
int nextIdx, int nextIdx,
...@@ -115,8 +112,7 @@ void subMaxAndExp(real* I, ...@@ -115,8 +112,7 @@ void subMaxAndExp(real* I,
__syncthreads(); __syncthreads();
} }
__device__ __forceinline__ __device__ __forceinline__ void valueSum(real* O,
void valueSum(real* O,
real* dfMax_s, real* dfMax_s,
int blockSize, int blockSize,
int base, int base,
...@@ -141,13 +137,8 @@ void valueSum(real* O, ...@@ -141,13 +137,8 @@ void valueSum(real* O,
__syncthreads(); __syncthreads();
} }
__device__ __forceinline__ __device__ __forceinline__ void divSum(
void divSum(real* O, real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
real sum,
int curIdx,
int nextIdx,
int blockSize,
int dimN) {
while (curIdx < dimN) { while (curIdx < dimN) {
O[nextIdx] /= sum; O[nextIdx] /= sum;
nextIdx += blockSize; nextIdx += blockSize;
...@@ -155,8 +146,7 @@ void divSum(real* O, ...@@ -155,8 +146,7 @@ void divSum(real* O,
} }
} }
__device__ __forceinline__ __device__ __forceinline__ void softmax(real* I,
void softmax(real* I,
real* O, real* O,
real* dfMax_s, real* dfMax_s,
int blockSize, int blockSize,
...@@ -167,8 +157,7 @@ void softmax(real* I, ...@@ -167,8 +157,7 @@ void softmax(real* I,
__shared__ real max; __shared__ real max;
// find the max number // find the max number
findMax(I, dfMax_s, blockSize, base, curIdx, findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
nextIdx, dimN, &max);
// sub max Value and do Exp operation // sub max Value and do Exp operation
subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max); subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
...@@ -181,8 +170,8 @@ void softmax(real* I, ...@@ -181,8 +170,8 @@ void softmax(real* I,
divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN); divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
} }
template<int blockSize> template <int blockSize>
__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { __global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
int base = threadIdx.x; int base = threadIdx.x;
__shared__ real dfMax_s[blockSize]; __shared__ real dfMax_s[blockSize];
int nextIdx = blockIdx.x * dimN + base; int nextIdx = blockIdx.x * dimN + base;
...@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { ...@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
} }
void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) { void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
dim3 block(512, 1); dim3 block(512, 1);
dim3 grid(dimM, 1); dim3 grid(dimM, 1);
KeMatrixSoftMax<512> KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
CHECK_SYNC("hl_matrix_softmax failed"); CHECK_SYNC("hl_matrix_softmax failed");
} }
template<int blockSize> template <int blockSize>
__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { __global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
int base = threadIdx.x; int base = threadIdx.x;
int bid = blockIdx.x; int bid = blockIdx.x;
__shared__ real dfMax_s[blockSize]; __shared__ real dfMax_s[blockSize];
...@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { ...@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
} }
void hl_sequence_softmax_forward(real *A_d, void hl_sequence_softmax_forward(real* A_d,
real *C_d, real* C_d,
const int* index, const int* index,
int numSequence) { int numSequence) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
...@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d, ...@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
dim3 block(512, 1); dim3 block(512, 1);
dim3 grid(numSequence, 1); dim3 grid(numSequence, 1);
KeSequenceSoftMax<512> KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
CHECK_SYNC("hl_sequence_softmax_forward failed"); CHECK_SYNC("hl_sequence_softmax_forward failed");
} }
__global__ void KeMatrixDerivative(real *grad_d, __global__ void KeMatrixDerivative(
real *output_d, real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
real *sftmaxSum_d, int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int dimM, int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
int index; int index;
if (rowIdx < dimM && colIdx < dimN) { if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx; index = rowIdx * dimN + colIdx;
grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]); grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
} }
} }
void hl_matrix_softmax_derivative(real *grad_d, void hl_matrix_softmax_derivative(
real *output_d, real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
real *sftmaxSum_d,
int dimM,
int dimN) {
CHECK_NOTNULL(grad_d); CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d); CHECK_NOTNULL(output_d);
CHECK_NOTNULL(sftmaxSum_d); CHECK_NOTNULL(sftmaxSum_d);
int blocksX = (dimM + 0) / 1; int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 -1) / 1024; int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024); dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_d, output_d, sftmaxSum_d, dimM, dimN); grad_d, output_d, sftmaxSum_d, dimM, dimN);
CHECK_SYNC("hl_matrix_softmax_derivative failed"); CHECK_SYNC("hl_matrix_softmax_derivative failed");
} }
__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, __global__ void KeMatrixMultiBinaryCrossEntropy(
real* entropy, real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
int* row,
int* col,
int dimM,
int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < dimM) { if (index < dimM) {
for (int i = 0; i < dimN; i ++) { for (int i = 0; i < dimN; i++) {
entropy[index] -= log(1 - output[index * dimN + i]); entropy[index] -= log(1 - output[index * dimN + i]);
} }
int *row_col = col + row[index]; int* row_col = col + row[index];
int col_num = row[index + 1] - row[index]; int col_num = row[index + 1] - row[index];
for (int i = 0; i < col_num; i ++) { for (int i = 0; i < col_num; i++) {
real o = output[index * dimN + row_col[i]]; real o = output[index * dimN + row_col[i]];
entropy[index] -= log(o / (1 - o)); entropy[index] -= log(o / (1 - o));
} }
...@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output, ...@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
dim3 threads(n_threads); dim3 threads(n_threads);
dim3 grid(blocks); dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
} }
__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output, __global__ void KeMatrixMultiBinaryCrossEntropyBp(
real* grad, real* output, real* grad, int* row, int* col, int dimM, int dimN) {
int* row,
int* col,
int dimM,
int dimN) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x; int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx < dimM) { if (row_idx < dimM) {
for (int i = 0; i < dimN; i ++) { for (int i = 0; i < dimN; i++) {
int index = row_idx * dimN + i; int index = row_idx * dimN + i;
grad[index] += 1.0 / (1 - output[index]); grad[index] += 1.0 / (1 - output[index]);
} }
int col_num = row[row_idx + 1] - row[row_idx]; int col_num = row[row_idx + 1] - row[row_idx];
int *row_col = col + row[row_idx]; int* row_col = col + row[row_idx];
for (int i = 0; i < col_num; i ++) { for (int i = 0; i < col_num; i++) {
int index = row_idx * dimN + row_col[i]; int index = row_idx * dimN + row_col[i];
grad[index] -= 1.0 / (output[index] * (1 - output[index])); grad[index] -= 1.0 / (output[index] * (1 - output[index]));
} }
} }
} }
void hl_matrix_multi_binary_cross_entropy_bp(real* output, void hl_matrix_multi_binary_cross_entropy_bp(
real* grad, real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
hl_sparse_matrix_s csr_mat,
int dimM,
int dimN) {
CHECK_NOTNULL(output); CHECK_NOTNULL(output);
CHECK_NOTNULL(grad); CHECK_NOTNULL(grad);
CHECK_NOTNULL(csr_mat); CHECK_NOTNULL(csr_mat);
...@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output, ...@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
dim3 threads(n_threads); dim3 threads(n_threads);
dim3 grid(blocks); dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, grad, mat->csr_row, mat->csr_col, dimM, dimN); output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
} }
__global__ void KeMatrixCrossEntropy(real* O, __global__ void KeMatrixCrossEntropy(
real* E, real* O, real* E, int* label, int dimM, int dimN) {
int* label,
int dimM,
int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int newBase; int newBase;
if (index < dimM) { if (index < dimM) {
...@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O, ...@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
} }
} }
void hl_matrix_cross_entropy(real* A_d, void hl_matrix_cross_entropy(
real* C_d, real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
int* label_d,
int dimM,
int dimN) {
CHECK_NOTNULL(A_d); CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d); CHECK_NOTNULL(C_d);
int blocks = (dimM + 1024 - 1) / 1024; int blocks = (dimM + 1024 - 1) / 1024;
dim3 threads(1024, 1); dim3 threads(1024, 1);
dim3 grid(blocks, 1); dim3 grid(blocks, 1);
KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
(A_d, C_d, label_d, dimM, dimN); A_d, C_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy failed"); CHECK_SYNC("hl_matrix_cross_entropy failed");
} }
__global__ void KeMatrixCrossEntropyBp(real* grad_d, __global__ void KeMatrixCrossEntropyBp(
real* output_d, real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
int* label_d, int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int dimM, int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
int index; int index;
if (rowIdx < dimM && colIdx < dimN) { if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx; index = rowIdx * dimN + colIdx;
if (label_d[rowIdx] == colIdx) { if (label_d[rowIdx] == colIdx) {
grad_d[index] -= 1.0f / output_d[index]; grad_d[index] -= 1.0f / output_d[index];
} }
} }
} }
void hl_matrix_cross_entropy_bp(real* grad_d, void hl_matrix_cross_entropy_bp(
real* output_d, real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
int* label_d,
int dimM,
int dimN) {
CHECK_NOTNULL(grad_d); CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d); CHECK_NOTNULL(output_d);
CHECK_NOTNULL(label_d); CHECK_NOTNULL(label_d);
int blocksX = (dimM + 0)/1; int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 -1) / 1024; int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024); dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY); dim3 grid(blocksX, blocksY);
KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_d, output_d, label_d, dimM, dimN); grad_d, output_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
} }
void hl_matrix_zero_mem(real* data, int num) { void hl_matrix_zero_mem(real* data, int num) {
hl_gpu_apply_unary_op( hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
unary::Zero<real>(), data, 1, num, num);
} }
__global__ void KeParamReluForward(real* output, __global__ void KeParamReluForward(real* output,
...@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output, ...@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
int ty = blockIdx.y * blockDim.y + threadIdx.y; int ty = blockIdx.y * blockDim.y + threadIdx.y;
if (tx < width && ty < height) { if (tx < width && ty < height) {
int index = ty * width + tx; int index = ty * width + tx;
output[index] = input[index] > 0 ? input[index] : output[index] =
input[index] * w[tx / partial_sum]; input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
} }
} }
...@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output, ...@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
CHECK_NOTNULL(w); CHECK_NOTNULL(w);
dim3 threads(16, 16); dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16; int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16; int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(output, input, w, width, height, partial_sum); output, input, w, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_forward failed"); CHECK_SYNC("hl_param_relu_forward failed");
} }
template<int blockSize> template <int blockSize>
__global__ void KeParamReluBackWardW(real* grad_w, __global__ void KeParamReluBackWardW(real* grad_w,
real* grad_o, real* grad_o,
real* input, real* input,
...@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w, ...@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
int grid_num = width / partial_sum; int grid_num = width / partial_sum;
dim3 threads(blockSize, 1); dim3 threads(blockSize, 1);
dim3 grid(grid_num, 1); dim3 grid(grid_num, 1);
KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_w, grad_o, input, width, height, partial_sum); grad_w, grad_o, input, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_w failed"); CHECK_SYNC("hl_param_relu_backward_w failed");
} }
...@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o, ...@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
CHECK_NOTNULL(diff); CHECK_NOTNULL(diff);
dim3 threads(16, 16); dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16; int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16; int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY); dim3 grid(blockX, blockY);
KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>> KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
(grad_o, data, w, diff, width, height, partial_sum); grad_o, data, w, diff, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_diff failed"); CHECK_SYNC("hl_param_relu_backward_diff failed");
} }
__global__ void KeMatrixAddSharedBias(real* A, __global__ void KeMatrixAddSharedBias(
real* B, real* A, real* B, const int channel, const int M, const int N, real scale) {
const int channel,
const int M,
const int N,
real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel; int dim = N / channel;
if (index < M * N) { if (index < M * N) {
...@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d, ...@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
real scale) { real scale) {
const int blocks = 512; const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks); const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>> KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
(A_d, B_d, channel, dimM, dimN, scale); A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed"); CHECK_SYNC("hl_matrix_add_shared_bias failed");
} }
template <int blockSize> template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B, __global__ void KeMatrixCollectSharedBias(real* B,
real *A, real* A,
const int channel, const int channel,
const int M, const int M,
const int N, const int N,
...@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d, ...@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d,
const int limit = 64; const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
KeMatrixCollectSharedBias<blocks> KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
<<< grids, blocks, 0, STREAM_DEFAULT>>> B_d, A_d, channel, dimM, dimN, dim, limit, scale);
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed"); CHECK_SYNC("hl_matrix_collect_shared_bias failed");
} }
__global__ void keMatrixRotate(real* mat, real* matRot, __global__ void keMatrixRotate(
int dimM, int dimN, bool clockWise) { real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < dimM * dimN) { if (idx < dimM * dimN) {
int i = idx / dimN; int i = idx / dimN;
...@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot, ...@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot,
} }
} }
void hl_matrix_rotate(real *mat, real* matRot, void hl_matrix_rotate(
int dimM, int dimN, bool clockWise) { real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
CHECK_NOTNULL(mat); CHECK_NOTNULL(mat);
CHECK_NOTNULL(matRot); CHECK_NOTNULL(matRot);
const int threads = 512; const int threads = 512;
const int blocks = DIVUP(dimM * dimN, threads); const int blocks = DIVUP(dimM * dimN, threads);
keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>> keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
(mat, matRot, dimM, dimN, clockWise); mat, matRot, dimM, dimN, clockWise);
CHECK_SYNC("hl_matrix_rotate failed"); CHECK_SYNC("hl_matrix_rotate failed");
} }
...@@ -16,36 +16,36 @@ limitations under the License. */ ...@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh" #include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
__global__ void KeMaxSequenceForward(real *input, __global__ void KeMaxSequenceForward(real* input,
const int *sequence, const int* sequence,
real* output, real* output,
int *index, int* index,
int numSequences, int numSequences,
int dim) { int dim) {
int dimIdx = threadIdx.x; int dimIdx = threadIdx.x;
int sequenceId = blockIdx.x; int sequenceId = blockIdx.x;
if (sequenceId >= numSequences) return; if (sequenceId >= numSequences) return;
int start = sequence[sequenceId]; int start = sequence[sequenceId];
int end = sequence[sequenceId+1]; int end = sequence[sequenceId + 1];
for (int i = dimIdx; i < dim; i += blockDim.x) { for (int i = dimIdx; i < dim; i += blockDim.x) {
real tmp = -HL_FLOAT_MAX; real tmp = -HL_FLOAT_MAX;
int tmpId = -1; int tmpId = -1;
for (int insId = start; insId < end; insId++) { for (int insId = start; insId < end; insId++) {
if (tmp < input[insId*dim + i]) { if (tmp < input[insId * dim + i]) {
tmp = input[insId*dim + i]; tmp = input[insId * dim + i];
tmpId = insId; tmpId = insId;
} }
} }
output[sequenceId*dim + i] = tmp; output[sequenceId * dim + i] = tmp;
index[sequenceId*dim + i] = tmpId; index[sequenceId * dim + i] = tmpId;
} }
} }
void hl_max_sequence_forward(real* input, void hl_max_sequence_forward(real* input,
const int* sequence, const int* sequence,
real* output, real* output,
int *index, int* index,
int numSequences, int numSequences,
int dim) { int dim) {
CHECK_NOTNULL(input); CHECK_NOTNULL(input);
...@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input, ...@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
dim3 threads(256, 1); dim3 threads(256, 1);
dim3 grid(numSequences, 1); dim3 grid(numSequences, 1);
KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>> KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(input, sequence, output, index, numSequences, dim); input, sequence, output, index, numSequences, dim);
CHECK_SYNC("hl_max_sequence_forward failed"); CHECK_SYNC("hl_max_sequence_forward failed");
} }
__global__ void KeMaxSequenceBackward(real *outputGrad, __global__ void KeMaxSequenceBackward(
int *index, real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
real* inputGrad,
int numSequences,
int dim) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
int colIdx = idx % dim; int colIdx = idx % dim;
if (idx < numSequences*dim) { if (idx < numSequences * dim) {
int insId = index[idx]; int insId = index[idx];
inputGrad[insId * dim + colIdx] += outputGrad[idx]; inputGrad[insId * dim + colIdx] += outputGrad[idx];
} }
} }
void hl_max_sequence_backward(real* outputGrad, void hl_max_sequence_backward(
int *index, real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
real* inputGrad,
int numSequences,
int dim) {
CHECK_NOTNULL(outputGrad); CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(index); CHECK_NOTNULL(index);
CHECK_NOTNULL(inputGrad); CHECK_NOTNULL(inputGrad);
...@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad, ...@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
unsigned int blocks = (numSequences * dim + 128 - 1) / 128; unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
dim3 threads(128, 1); dim3 threads(128, 1);
dim3 grid(blocks, 1); dim3 grid(blocks, 1);
KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>> KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
(outputGrad, index, inputGrad, numSequences, dim); outputGrad, index, inputGrad, numSequences, dim);
CHECK_SYNC("hl_max_sequence_backward failed"); CHECK_SYNC("hl_max_sequence_backward failed");
} }
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow> template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output, __global__ void KeMatrixAddRows(real* output,
real* table, real* table,
int* ids, int* ids,
...@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output, ...@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
while (sampleId < numSamples) { while (sampleId < numSamples) {
int tableId = ids[sampleId]; int tableId = ids[sampleId];
if ((0 <= tableId) && (tableId < tableSize)) { if ((0 <= tableId) && (tableId < tableSize)) {
real *outputData = output + sampleId * dim; real* outputData = output + sampleId * dim;
real *tableData = table + tableId * dim; real* tableData = table + tableId * dim;
for (int i = idx; i < dim; i += blockDimX) { for (int i = idx; i < dim; i += blockDimX) {
if (AddRow == 0) { if (AddRow == 0) {
outputData[i] += tableData[i]; outputData[i] += tableData[i];
...@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output, ...@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output,
} }
} }
} }
sampleId += blockDimY*gridDimX; sampleId += blockDimY * gridDimX;
} }
} }
template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd> template <int blockDimX,
__global__ int blockDimY,
void KeSequence2Batch(real *batch, int gridDimX,
real *sequence, bool seq2batch,
const int *batchIndex, bool isAdd>
__global__ void KeSequence2Batch(real* batch,
real* sequence,
const int* batchIndex,
int seqWidth, int seqWidth,
int batchCount) { int batchCount) {
int idx = threadIdx.x; int idx = threadIdx.x;
...@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch, ...@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch,
int id = blockIdx.x + idy * gridDimX; int id = blockIdx.x + idy * gridDimX;
while (id < batchCount) { while (id < batchCount) {
int seqId = batchIndex[id]; int seqId = batchIndex[id];
real* batchData = batch + id*seqWidth; real* batchData = batch + id * seqWidth;
real* seqData = sequence + seqId*seqWidth; real* seqData = sequence + seqId * seqWidth;
for (int i = idx; i < seqWidth; i += blockDimX) { for (int i = idx; i < seqWidth; i += blockDimX) {
if (seq2batch) { if (seq2batch) {
if (isAdd) { if (isAdd) {
...@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch, ...@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
} }
} }
} }
id += blockDimY*gridDimX; id += blockDimY * gridDimX;
} }
} }
void hl_sequence2batch_copy(real *batch, void hl_sequence2batch_copy(real* batch,
real *sequence, real* sequence,
const int *batchIndex, const int* batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch) { bool seq2batch) {
...@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch, ...@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
if (seq2batch) { if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} else { } else {
KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} }
CHECK_SYNC("hl_sequence2batch_copy failed"); CHECK_SYNC("hl_sequence2batch_copy failed");
} }
void hl_sequence2batch_add(real *batch, void hl_sequence2batch_add(real* batch,
real *sequence, real* sequence,
int *batchIndex, int* batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch) { bool seq2batch) {
...@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch, ...@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
if (seq2batch) { if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} else { } else {
KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>> KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
(batch, sequence, batchIndex, seqWidth, batchCount); batch, sequence, batchIndex, seqWidth, batchCount);
} }
CHECK_SYNC("hl_sequence2batch_add failed"); CHECK_SYNC("hl_sequence2batch_add failed");
} }
template<bool normByTimes, bool seq2batch> template <bool normByTimes, bool seq2batch>
__global__ __global__ void KeSequence2BatchPadding(real* batch,
void KeSequence2BatchPadding(real* batch,
real* sequence, real* sequence,
const int* sequenceStartPositions, const int* sequenceStartPositions,
const size_t sequenceWidth, const size_t sequenceWidth,
...@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch, ...@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
if (seq2batch) { if (seq2batch) {
/* sequence -> batch */ /* sequence -> batch */
if (normByTimes) { if (normByTimes) {
KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else { } else {
KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} }
} else { } else {
/* batch -> sequence */ /* batch -> sequence */
if (normByTimes) { if (normByTimes) {
KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else { } else {
KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, sequenceStartPositions, batch,
sequenceWidth, maxSequenceLength, numSequences); sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} }
} }
CHECK_SYNC("hl_sequence2batch_copy_padding failed"); CHECK_SYNC("hl_sequence2batch_copy_padding failed");
} }
__device__ inline float my_rsqrt(float x) { __device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
return rsqrtf(x);
}
__device__ inline double my_rsqrt(double x) { __device__ inline double my_rsqrt(double x) { return rsqrt(x); }
return rsqrt(x);
}
__global__ void KeSequenceAvgForward(real* dst, __global__ void KeSequenceAvgForward(real* dst,
real* src, real* src,
...@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst, ...@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
sum += src[i * width + col]; sum += src[i * width + col];
} }
sum = mode == 1 ? sum : sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); : sum * my_rsqrt((real)seqLength));
dst[gid] += sum; dst[gid] += sum;
} }
} }
...@@ -349,8 +357,8 @@ void hl_sequence_avg_forward(real* dst, ...@@ -349,8 +357,8 @@ void hl_sequence_avg_forward(real* dst,
CHECK(mode == 0 || mode == 1 || mode == 2) CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_forward!"; << "mode error in hl_sequence_avg_forward!";
KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>> KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
(dst, src, starts, height, width, mode); dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed"); CHECK_SYNC("hl_sequence_avg_forward failed");
} }
...@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst, ...@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
int seqLength = end - start; int seqLength = end - start;
if (seqLength == 0) return; if (seqLength == 0) return;
real grad = src[gid]; real grad = src[gid];
grad = mode == 1 ? grad : grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength)); : grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
dst[i * width + col] += grad; dst[i * width + col] += grad;
} }
...@@ -394,7 +402,7 @@ void hl_sequence_avg_backward(real* dst, ...@@ -394,7 +402,7 @@ void hl_sequence_avg_backward(real* dst,
CHECK(mode == 0 || mode == 1 || mode == 2) CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!"; << "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
(dst, src, starts, height, width, mode); dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed"); CHECK_SYNC("hl_sequence_avg_backward failed");
} }
此差异已折叠。
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cmath>
#include <stdlib.h> #include <stdlib.h>
#include "hl_cuda.h" #include <cmath>
#include "hl_time.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh" #include "hl_perturbation_util.cuh"
#include "hl_time.h"
#define _USE_MATH_DEFINES #define _USE_MATH_DEFINES
...@@ -30,10 +29,16 @@ limitations under the License. */ ...@@ -30,10 +29,16 @@ limitations under the License. */
* centerX, centerY: translation. * centerX, centerY: translation.
* sourceX, sourceY: output coordinates in the original image. * sourceX, sourceY: output coordinates in the original image.
*/ */
__device__ void getTranformCoord(int x, int y, real theta, real scale, __device__ void getTranformCoord(int x,
real tgtCenter, real imgCenter, int y,
real centerR, real centerC, real theta,
int* sourceX, int* sourceY) { real scale,
real tgtCenter,
real imgCenter,
real centerR,
real centerC,
int* sourceX,
int* sourceY) {
real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
// compute coornidates in the rotated and scaled image // compute coornidates in the rotated and scaled image
...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, ...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
* created by Wei Xu (genome), converted by Jiang Wang * created by Wei Xu (genome), converted by Jiang Wang
*/ */
__global__ void kSamplingPatches(const real* imgs, real* targets, __global__ void kSamplingPatches(const real* imgs,
int imgSize, int tgtSize, const int channels, real* targets,
int samplingRate, const real* thetas, int imgSize,
const real* scales, const int* centerRs, int tgtSize,
const int* centerCs, const real padValue, const int channels,
int samplingRate,
const real* thetas,
const real* scales,
const int* centerRs,
const int* centerCs,
const real padValue,
const int numImages) { const int numImages) {
const int caseIdx = blockIdx.x * 4 + threadIdx.x; const int caseIdx = blockIdx.x * 4 + threadIdx.x;
const int pxIdx = blockIdx.y * 128 + threadIdx.y; const int pxIdx = blockIdx.y * 128 + threadIdx.y;
...@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, ...@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
const int pxY = pxIdx / tgtSize; const int pxY = pxIdx / tgtSize;
int srcPxX, srcPxY; int srcPxX, srcPxY;
getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter, getTranformCoord(pxX,
imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX, pxY,
thetas[imgIdx],
scales[imgIdx],
tgtCenter,
imgCenter,
centerCs[caseIdx],
centerRs[caseIdx],
&srcPxX,
&srcPxY); &srcPxY);
imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels; imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
...@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, ...@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
* *
* created by Wei Xu * created by Wei Xu
*/ */
void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, void hl_generate_disturb_params(real*& gpuAngle,
int*& gpuCenterR, int*& gpuCenterC, real*& gpuScaleRatio,
int numImages, int imgSize, real rotateAngle, int*& gpuCenterR,
real scaleRatio, int samplingRate, int*& gpuCenterC,
int numImages,
int imgSize,
real rotateAngle,
real scaleRatio,
int samplingRate,
bool isTrain) { bool isTrain) {
// The number of output samples. // The number of output samples.
int numPatches = numImages * samplingRate; int numPatches = numImages * samplingRate;
...@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
for (int i = 0; i < numImages; i++) { for (int i = 0; i < numImages; i++) {
r_angle[i] = r_angle[i] =
(rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT
- 0.5); -
0.5);
s_ratio[i] = s_ratio[i] =
1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT 1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT
} }
...@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int pxY = int pxY =
(int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT
const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]), const real H[4] = {cos(-r_angle[i]),
sin(-r_angle[i]), cos(-r_angle[i])}; -sin(-r_angle[i]),
sin(-r_angle[i]),
cos(-r_angle[i])};
real x = pxX - imgCenter; real x = pxX - imgCenter;
real y = pxY - imgCenter; real y = pxY - imgCenter;
real xx = H[0] * x + H[1] * y; real xx = H[0] * x + H[1] * y;
...@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, ...@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
delete[] center_c; delete[] center_c;
} }
void hl_conv_random_disturb_with_params(const real* images, int imgSize, void hl_conv_random_disturb_with_params(const real* images,
int tgtSize, int channels, int imgSize,
int numImages, int samplingRate, int tgtSize,
int channels,
int numImages,
int samplingRate,
const real* gpuRotationAngle, const real* gpuRotationAngle,
const real* gpuScaleRatio, const real* gpuScaleRatio,
const int* gpuCenterR, const int* gpuCenterR,
...@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize, ...@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
dim3 threadsPerBlock(4, 128); dim3 threadsPerBlock(4, 128);
dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128)); dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
kSamplingPatches <<<numBlocks, threadsPerBlock>>> kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
(images, target, imgSize, tgtSize, channels, samplingRate, target,
gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, imgSize,
paddingValue, numImages); tgtSize,
channels,
samplingRate,
gpuRotationAngle,
gpuScaleRatio,
gpuCenterR,
gpuCenterC,
paddingValue,
numImages);
hl_device_synchronize(); hl_device_synchronize();
} }
void hl_conv_random_disturb(const real* images, int imgSize, void hl_conv_random_disturb(const real* images,
int tgtSize, int channels, int numImages, int imgSize,
real scaleRatio, real rotateAngle, int tgtSize,
int samplingRate, real* gpu_r_angle, int channels,
real* gpu_s_ratio, int* gpu_center_r, int numImages,
int* gpu_center_c, int paddingValue, real scaleRatio,
bool isTrain, real* targets) { real rotateAngle,
int samplingRate,
real* gpu_r_angle,
real* gpu_s_ratio,
int* gpu_center_r,
int* gpu_center_c,
int paddingValue,
bool isTrain,
real* targets) {
// generate the random disturbance sequence and the sampling locations // generate the random disturbance sequence and the sampling locations
hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r, hl_generate_disturb_params(gpu_r_angle,
gpu_center_c, numImages, imgSize, rotateAngle, gpu_s_ratio,
scaleRatio, samplingRate, isTrain); gpu_center_r,
gpu_center_c,
hl_conv_random_disturb_with_params( numImages,
images, imgSize, tgtSize, channels, numImages, imgSize,
samplingRate, gpu_r_angle, gpu_s_ratio, rotateAngle,
gpu_center_r, gpu_center_r, paddingValue, scaleRatio,
samplingRate,
isTrain);
hl_conv_random_disturb_with_params(images,
imgSize,
tgtSize,
channels,
numImages,
samplingRate,
gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_r,
paddingValue,
targets); targets);
} }
此差异已折叠。
此差异已折叠。
此差异已折叠。
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax="proto2";
package paddle.framework;
// Attribute type tag for Paddle operators.
// An Op can carry many attributes, and each attribute may have a different
// type. AttrType is shared between AttrDesc and AttrProto.
//
// The numeric values are what gets serialized, so existing values must not be
// renumbered or reused; add new types with fresh numbers only.
// NOTE(review): the plural names (INTS, FLOATS, STRINGS) presumably denote
// repeated/list-valued variants of the scalar types -- confirm against the
// messages that use this enum.
enum AttrType {
INT = 0;
FLOAT = 1;
STRING = 2;
INTS = 3;
FLOATS = 4;
STRINGS = 5;
}
\ No newline at end of file
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册