Unverified commit 7324ffae, authored by lujun, committed by GitHub

Merge pull request #3 from PaddlePaddle/develop

merge to dev
@@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip3 install -U wheel && \
-    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip3.6 install -U wheel && \
-    pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip3.7 install -U wheel && \
-    pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
-    pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
+RUN pip3 --no-cache-dir install -U wheel && \
+    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.6 --no-cache-dir install -U wheel && \
+    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.7 --no-cache-dir install -U wheel && \
+    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
     easy_install -U pip && \
-    pip install -U pip setuptools wheel && \
-    pip install -U docopt PyYAML sphinx==1.5.6 && \
-    pip install sphinx-rtd-theme==0.1.9 recommonmark
+    pip --no-cache-dir install -U pip setuptools wheel && \
+    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
-RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 install opencv-python && \
-    pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 install opencv-python && \
-    pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 install opencv-python && \
-    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install opencv-python
+RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 --no-cache-dir install opencv-python && \
+    pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.6 --no-cache-dir install opencv-python && \
+    pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.7 --no-cache-dir install opencv-python && \
+    pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip --no-cache-dir install opencv-python
 #For docstring checker
-RUN pip3 install pylint pytest astroid isort
-RUN pip3.6 install pylint pytest astroid isort
-RUN pip3.7 install pylint pytest astroid isort
-RUN pip install pylint pytest astroid isort LinkChecker
+RUN pip3 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
+RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 COPY ./python/requirements.txt /root/
-RUN pip3 install -r /root/requirements.txt
-RUN pip3.6 install -r /root/requirements.txt
-RUN pip3.7 install -r /root/requirements.txt
-RUN pip install -r /root/requirements.txt
+RUN pip3 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
+RUN pip --no-cache-dir install -r /root/requirements.txt
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
-RUN apt-get install -y libssl-dev libffi-dev
-RUN pip3 install certifi urllib3[secure]
-RUN pip3.6 install certifi urllib3[secure]
-RUN pip3.7 install certifi urllib3[secure]
-RUN pip install certifi urllib3[secure]
+RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
+RUN pip3 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
+RUN pip --no-cache-dir install certifi urllib3[secure]
 # Install woboq_codebrowser to /woboq
@@ -149,6 +149,14 @@ RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
        -DCMAKE_BUILD_TYPE=Release . \
     make)
+# ar mishandles 4GB files
+# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
+# remove them when apt-get support 2.27 and higher version
+RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \
+    tar -xzf binutils_2.27.orig.tar.gz && \
+    cd binutils-2.27 && \
+    ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd
...
@@ -139,10 +139,12 @@ endfunction()
 message(STATUS "CUDA detected: " ${CUDA_VERSION})
 if (${CUDA_VERSION} LESS 7.0)
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+  add_definitions("-DPADDLE_CUDA_BINVER=\"60\"")
 elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
   list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"70\"")
 elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
   list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
@@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
   # warning for now.
   list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
 endif()
 include_directories(${CUDA_INCLUDE_DIRS})
...
@@ -89,6 +89,7 @@ if(CUDNN_FOUND)
   if(NOT CUDNN_MAJOR_VERSION)
     set(CUDNN_VERSION "???")
   else()
+    add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
     math(EXPR CUDNN_VERSION
          "${CUDNN_MAJOR_VERSION} * 1000 +
           ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
...
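The CUDA and cuDNN hunks above stamp the toolkit major versions into the build as string-valued preprocessor definitions. A minimal sketch of how such definitions could be consumed at runtime; the version_banner helper and the fallback values are illustrative assumptions, not part of this commit:

    // Hypothetical consumer of the -DPADDLE_CUDA_BINVER="80" style
    // definitions added above; CMake makes the macros expand to string
    // literals, so they concatenate with neighbouring literals.
    #include <string>

    #ifndef PADDLE_CUDA_BINVER
    #define PADDLE_CUDA_BINVER "not-set"  // fallback for builds without CUDA
    #endif
    #ifndef PADDLE_CUDNN_BINVER
    #define PADDLE_CUDNN_BINVER "not-set"
    #endif

    std::string version_banner() {
      // Adjacent string literals are joined at compile time.
      return std::string("cuda: " PADDLE_CUDA_BINVER
                         ", cudnn: " PADDLE_CUDNN_BINVER);
    }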
@@ -32,4 +32,4 @@ endif()
 add_dependencies(cub extern_cub)
-LIST(APPEND externl_project_dependencies cub)
+LIST(APPEND external_project_dependencies cub)
@@ -28,4 +28,4 @@ endif()
 add_dependencies(dlpack extern_dlpack)
-LIST(APPEND externl_project_dependencies dlpack)
+LIST(APPEND external_project_dependencies dlpack)
@@ -106,10 +106,10 @@ else(WIN32)
     SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
     ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
             COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-            DEPENDS mkldnn)
+            DEPENDS mkldnn shared_mkldnn)
 endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
+ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
 IF(WITH_C_API)
   INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
 ENDIF()
...
@@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_VERSION         "0.9")
-SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
+SET(NGRAPH_GIT_TAG         "v0.10.1")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
 SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
+SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
...
@@ -136,7 +136,7 @@ if (WITH_MKLDNN)
         copy(mkldnn_lib
           SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
           DSTS ${dst_dir} ${dst_dir}/lib
-          DEPS mkldnn
+          DEPS mkldnn_shared_lib
         )
 endif ()
...
@@ -110,7 +110,7 @@ function(op_library TARGET)
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
         "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-        "fusion_transpose_flatten_concat_op")
+        "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
...
@@ -57,46 +57,43 @@ int main()
     return 0;
 }" SSE3_FOUND)
-# disable AVX by default on windows
-if(NOT WIN32)
 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
 set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
     __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
     __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
     __m256 result = _mm256_add_ps (a, b);
     return 0;
 }" AVX_FOUND)
 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
 set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
     __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
     __m256i result = _mm256_abs_epi32 (a);
     return 0;
 }" AVX2_FOUND)
 # Check AVX512F
 set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
 set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
     __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
                                   13, -5, 6, -7, 9, 2, -6, 3);
     __m512i result = _mm512_abs_epi32 (a);
     return 0;
 }" AVX512F_FOUND)
-endif(NOT WIN32)
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
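The hunk above drops the `if(NOT WIN32)` guard so the AVX/AVX2/AVX512F TRY_RUN probes also execute on Windows. For contrast, a runtime feature probe for the same instruction sets can be written with the GCC/Clang `__builtin_cpu_supports` builtin; this is a sketch unrelated to the configure-time machinery above:

    // Runtime CPU-feature probe (GCC/Clang builtins). The build system above
    // instead compiles and runs small intrinsic test programs at configure
    // time, which also verifies the compiler flags work.
    #include <cstdio>

    int main() {
    #if defined(__GNUC__)
      std::printf("avx:     %d\n", __builtin_cpu_supports("avx"));
      std::printf("avx2:    %d\n", __builtin_cpu_supports("avx2"));
      std::printf("avx512f: %d\n", __builtin_cpu_supports("avx512f"));
    #endif
      return 0;
    }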
@@ -60,7 +60,7 @@ class Float16Transpiler:
             raise TypeError("place should be as CPUPlace/CUDAPlace type")
         if scope is None:
             scope = global_scope()
-        if not isinstance(scope, core.Scope):
+        if not isinstance(scope, core._Scope):
             raise TypeError("scope should be as Scope type or None")
         self.scope = scope
...
@@ -464,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
-paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
-paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
-paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
-paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
-paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
+paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
 paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
 paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
 paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
...
@@ -7,27 +7,17 @@ function(windows_symbolic TARGET)
     cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
     foreach(src ${windows_symbolic_SRCS})
         get_filename_component(src ${src} NAME_WE)
-        if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
+        if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
             message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
         endif()
-        #only copy the xx.cu to.xx.cu when the content are modified
-        set(copy_flag 1)
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
-            file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
-            file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR)
-            if (SOURCE_STR STREQUAL TARGET_STR)
-                set(copy_flag 0)
-            endif()
-        endif()
-        if (copy_flag)
-            add_custom_command(OUTPUT .${src}.cu
-                COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
-                COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
-                COMMENT "create hidden file of ${src}.cu")
-        endif(copy_flag)
-        add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
+        file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
+        add_custom_command(OUTPUT ${final_path}/.${src}.cu
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
+            COMMENT "create hidden file of ${src}.cu")
+        add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
     endforeach()
 endfunction()
@@ -37,9 +27,10 @@ add_subdirectory(details)
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)
-cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
@@ -78,17 +69,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
-cc_test(variable_test SRCS variable_test.cc)
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
-cc_library(scope SRCS scope.cc DEPS glog threadpool)
+cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
+if (WITH_GPU)
+  target_link_libraries(var_type_traits dynload_cuda)
+endif()
+cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
+cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
+cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
+cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry device_context math_function)
+        DEPS operator op_registry device_context math_function scope)
 if(WITH_GPU)
   if (WIN32)
@@ -133,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
 if(WITH_NGRAPH)
-  if(NOT WIN32)
-    cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-    cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-               shape_inference data_transform lod_tensor profiler ngraph)
-  endif(NOT WIN32)
+  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
+  cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+             shape_inference data_transform lod_tensor profiler)
 endif(WITH_NGRAPH)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -179,11 +174,7 @@ if(WITH_DISTRIBUTE)
 else()
   if(WITH_NGRAPH)
-    if(NOT WIN32)
-      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
-    else(NOT WIN32)
-      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-    endif(NOT WIN32)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
   else(WITH_NGRAPH)
     cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
   endif(WITH_NGRAPH)
@@ -198,9 +189,9 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         fast_threaded_ssa_graph_executor variable_helper)
 if(WITH_PSLIB)
-  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib)
+  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer)
 else()
-  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+  cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer)
 endif(WITH_PSLIB)
...
@@ -15,34 +15,123 @@
 #pragma once
 #include <cstdint>
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/framework/unroll_array_ops.h"
+#include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace framework {
 template <typename T, size_t N>
 class Array {
-  static_assert(N > 0, "The size of array must be larger than 0");
-
  public:
-  HOSTDEVICE Array() {}
+  static constexpr size_t kSize = N;
+
+  HOSTDEVICE inline Array() {}
-  HOSTDEVICE explicit Array(const T &val) {
-    for (size_t i = 0; i < N; ++i) data_[i] = val;
+  template <typename... Args>
+  HOSTDEVICE inline explicit Array(const T &val, Args... args) {
+    static_assert(N == sizeof...(Args) + 1, "Invalid argument");
+    UnrollVarArgsAssign<T>::Run(data_, val, args...);
   }
-  HOSTDEVICE const T *Get() const { return data_; }
+  HOSTDEVICE inline void Fill(const T &val) {
+    UnrollFillConstant<N>::Run(data_, val);
+  }
-  HOSTDEVICE T *GetMutable() { return data_; }
+  HOSTDEVICE inline const T *Get() const { return data_; }
-  HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
+  HOSTDEVICE inline T *GetMutable() { return data_; }
-  HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
+  HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); }
+
+  // Writing "return data_[i]" would cause compilation warning/error:
+  // "array subscript is above array bound" in Python 35 CI.
+  // It seems that it is a false warning of GCC if we do not check the bounds
+  // of array index. But for better performance, we do not check in operator[]
+  // like what is in STL. If users want to check the bounds, use at() instead
+  HOSTDEVICE inline const T &operator[](size_t i) const {
+    return *advance(data_, i);
+  }
+
+  HOSTDEVICE inline T &at(size_t i) {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
+
+  HOSTDEVICE inline const T &at(size_t i) const {
+#ifndef __CUDA_ARCH__
+    PADDLE_ENFORCE_LT(i, N, "Array index out of bounds");
+#endif
+    return (*this)[i];
+  }
   HOSTDEVICE constexpr size_t size() const { return N; }
+  HOSTDEVICE inline bool operator==(const Array<T, N> &other) const {
+    return UnrollCompare<N>::Run(data_, other.data_);
+  }
+
+  HOSTDEVICE inline bool operator!=(const Array<T, N> &other) const {
+    return !(*this == other);
+  }
  private:
+  template <typename U>
+  HOSTDEVICE static inline U *advance(U *ptr, size_t i) {
+    return ptr + i;
+  }
+
   T data_[N];
 };
+template <typename T>
+class Array<T, 0> {
+ public:
+  static constexpr size_t kSize = 0;
+
+  HOSTDEVICE inline Array() {}
+
+  HOSTDEVICE inline void Fill(const T &val) {}
+
+  HOSTDEVICE inline constexpr T *Get() const { return nullptr; }
+
+  // Add constexpr to GetMutable() cause warning in MAC
+  HOSTDEVICE inline T *GetMutable() { return nullptr; }
+
+  HOSTDEVICE inline T &operator[](size_t) {
+#ifdef __CUDA_ARCH__
+    static T obj();
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+
+  HOSTDEVICE inline const T &operator[](size_t) const {
+#ifdef __CUDA_ARCH__
+    static const T obj();
+    return obj;
+#else
+    PADDLE_THROW("Array<T, 0> has no element");
+#endif
+  }
+
+  HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; }
+
+  HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; }
+
+  HOSTDEVICE constexpr size_t size() const { return 0; }
+
+  HOSTDEVICE constexpr bool operator==(const Array<T, 0> &other) const {
+    return true;
+  }
+
+  HOSTDEVICE constexpr bool operator!=(const Array<T, 0> &other) const {
+    return false;
+  }
+};
+
 }  // namespace framework
 }  // namespace paddle
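For orientation, here is a small host-side sketch of the interface the rewritten Array exposes (illustrative usage, not code from this commit): the variadic constructor enforces the element count at compile time, Fill overwrites all N slots via the unrolled helper, at() adds a host-only bounds check, and equality compares element-wise.

    // Sketch of host-side usage of the Array<T, N> defined above.
    #include "paddle/fluid/framework/array.h"

    using paddle::framework::Array;

    void array_demo() {
      Array<int64_t, 3> a(1, 2, 3);  // N == sizeof...(args) + 1 is enforced
      Array<int64_t, 3> b;
      b.Fill(2);                     // b == {2, 2, 2}
      b[0] = 1;                      // unchecked access, like STL operator[]
      b.at(2) = 3;                   // host build: PADDLE_ENFORCE_LT bounds check
      bool same = (a == b);          // element-wise compare; true here
      (void)same;
    }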
@@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   // start executing ops in multiple threads
   for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
-    threads.push_back(
-        std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    if (debug) {
+      threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
+                                    workers[thidx].get()));
+    } else {
+      threads.push_back(
+          std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    }
   }
   for (auto& th : threads) {
...
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
...
@@ -18,312 +18,159 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-/// @cond HIDDEN
-
-template <int i>
-Dim<i> make_dim(const int64_t* d) {
-  return Dim<i>(*d, make_dim<i - 1>(d + 1));
-}
-
-template <>
-Dim<0> make_dim<0>(const int64_t* d) {
-  return Dim<0>(*d);
-}
-
-void make_ddim(DDim& ddim, const int64_t* dims, int n) {
-  switch (n) {
-    case 0:
-      ddim = make_dim<0>(dims);
-      break;
-    case 1:
-      ddim = make_dim<1>(dims);
-      break;
-    case 2:
-      ddim = make_dim<2>(dims);
-      break;
-    case 3:
-      ddim = make_dim<3>(dims);
-      break;
-    case 4:
-      ddim = make_dim<4>(dims);
-      break;
-    case 5:
-      ddim = make_dim<5>(dims);
-      break;
-    case 6:
-      ddim = make_dim<6>(dims);
-      break;
-    case 7:
-      ddim = make_dim<7>(dims);
-      break;
-    case 8:
-      ddim = make_dim<8>(dims);
-      break;
-    case 9:
-      ddim = make_dim<9>(dims);
-      break;
-    default:
-      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
-  }
-}
-
-/// @endcond
-
 DDim make_ddim(std::initializer_list<int64_t> dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, dims.begin(), dims.size());
-  return result;
+  return DDim(dims.begin(), dims.size());
 }
 DDim make_ddim(const std::vector<int64_t>& dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, &dims[0], dims.size());
-  return result;
+  return DDim(dims.data(), dims.size());
 }
 DDim make_ddim(const std::vector<int>& dims) {
-  std::vector<int64_t> res(dims.size());
-  std::transform(dims.begin(), dims.end(), res.begin(),
-                 [](int d) { return static_cast<int64_t>(d); });
-  return make_ddim(res);
+  return DDim(dims.data(), dims.size());
 }
-/// @cond HIDDEN
-// XXX For some reason, putting this in an anonymous namespace causes errors
-class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
- public:
-  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
-
-  template <int D>
-  int64_t& operator()(Dim<D>& dim) const {
-    return dim[idx_];
-  }
-
- private:
-  int idx_;
-};
-
-class DynamicConstIndexer : public boost::static_visitor<int64_t> {
- public:
-  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
-
-  template <int D>
-  int64_t operator()(const Dim<D>& dim) const {
-    return dim[idx_];
-  }
-
- private:
-  int idx_;
-};
-
-/// @endcond
-
-int64_t& DDim::operator[](int idx) {
-  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
-}
-
-int64_t DDim::operator[](int idx) const {
-  return boost::apply_visitor(DynamicConstIndexer(idx), var);
-}
-
-int DDim::size() const { return arity(*this); }
-
-bool DDim::operator==(DDim d) const {
-  if (var.which() != d.getVar().which()) {
-    return false;
-  } else {
-    std::vector<int64_t> v1 = vectorize(*this);
-    std::vector<int64_t> v2 = vectorize(d);
-
-    for (unsigned int i = 0; i < v1.size(); i++) {
-      if (v1[i] != v2[i]) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-}
-
-bool DDim::operator!=(DDim d) const { return !(*this == d); }
-
-DDim DDim::operator+(DDim d) const {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
-
-  std::vector<int64_t> v3;
-
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] + v2[i]);
-  }
-
-  return make_ddim(v3);
-}
-
-DDim DDim::operator*(DDim d) const {
-  std::vector<int64_t> v1 = vectorize(*this);
-  std::vector<int64_t> v2 = vectorize(d);
-
-  std::vector<int64_t> v3;
-
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] * v2[i]);
-  }
-
-  return make_ddim(v3);
-}
+struct DDimEqualityVisitor {
+  explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {}
+
+  template <int D>
+  inline bool operator()(const Dim<D>& self) const {
+    return UnrollCompare<D>::Run(self.Get(), d_);
+  }
+
+  const int64_t* d_;
+};
+
+bool DDim::operator==(const DDim& d) const {
+  return size() == d.size() &&
+         this->apply_visitor(DDimEqualityVisitor(d.Get()));
+}
+
+bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
+
+struct DDimPlusVisitor {
+  explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2)
+      : d1_(d1), d2_(d2) {}
+
+  template <int D>
+  inline void operator()(Dim<D>& self) const {
+    UnrollAdd<D>::Run(d1_, d2_, self.GetMutable());
+  }
+
+  const int64_t* d1_;
+  const int64_t* d2_;
+};
+
+DDim DDim::operator+(const DDim& d) const {
+  PADDLE_ENFORCE(size() == d.size());
+  DDim ret;
+  ret.rank_ = rank_;
+  ret.apply_visitor(DDimPlusVisitor(Get(), d.Get()));
+  return ret;
+}
+
+struct DDimMulVisitor {
+  explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2)
+      : d1_(d1), d2_(d2) {}
+
+  template <int D>
+  inline void operator()(Dim<D>& self) const {
+    UnrollMul<D>::Run(d1_, d2_, self.GetMutable());
+  }
+
+  const int64_t* d1_;
+  const int64_t* d2_;
+};
+
+DDim DDim::operator*(const DDim& d) const {
+  PADDLE_ENFORCE(size() == d.size());
+  DDim ret;
+  ret.rank_ = rank_;
+  ret.apply_visitor(DDimMulVisitor(Get(), d.Get()));
+  return ret;
+}
 int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
-void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
+void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }  // NOLINT
-/// @cond HIDDEN
-struct VectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int64_t>& vector;
-
-  explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
-
-  template <typename T>
-  void operator()(const T& t) {
-    vector.push_back(t.head);
-    this->operator()(t.tail);
-  }
-
-  void operator()(const Dim<0>& t) {}
-};
-/// @endcond
-
 std::vector<int64_t> vectorize(const DDim& ddim) {
-  std::vector<int64_t> result;
-  VectorizeVisitor visitor(result);
-  boost::apply_visitor(visitor, ddim);
+  std::vector<int64_t> result(DDim::kMaxRank);
+  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
+  result.resize(ddim.size());
   return result;
 }
 // NOTE: framework::vectorize converts to type int64_t
 // which does not fit cudnn inputs.
 std::vector<int> vectorize2int(const DDim& ddim) {
-  std::vector<int64_t> temp = vectorize(ddim);
-  std::vector<int> result(temp.begin(), temp.end());
+  std::vector<int> result(DDim::kMaxRank);
+  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
+  result.resize(ddim.size());
   return result;
 }
-struct ProductVisitor : public boost::static_visitor<int64_t> {
+struct ProductVisitor {
   template <int D>
-  int64_t operator()(const Dim<D>& dim) {
+  inline int64_t operator()(const Dim<D>& dim) {
     return product(dim);
   }
 };
 int64_t product(const DDim& ddim) {
-  ProductVisitor visitor;
-  return boost::apply_visitor(visitor, ddim);
+  return ddim.apply_visitor(ProductVisitor());
 }
-struct SliceVectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int64_t>& vector;
-  int begin;
-  int end;
-
-  SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
-      : vector(v), begin(b), end(e) {
-    PADDLE_ENFORCE(begin < end,
-                   "Begin index must be less than end index in ddim slice.");
-    PADDLE_ENFORCE(begin >= 0,
-                   "Begin index can't be less than zero in ddim slice.");
-  }
-
-  template <int S>
-  void operator()(const Dim<S>& dim) {
-    if (begin == 0) {
-      vector.push_back(dim.head);
-    } else {
-      --begin;
-    }
-    --end;
-    if (end > 0) {
-      this->operator()(dim.tail);
-    }
-  }
-
-  void operator()(const Dim<0>& dim) {
-    PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
-  }
-};
-
 DDim slice_ddim(const DDim& dim, int begin, int end) {
-  std::vector<int64_t> vec;
-  vec.reserve(end - begin);
-  SliceVectorizeVisitor visitor(vec, begin, end);
-  boost::apply_visitor(visitor, dim);
-  return make_ddim(vec);
+  PADDLE_ENFORCE(begin >= 0 && end <= dim.size(),
+                 "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",
+                 begin, end, dim.size());
+  // Constructor of DDim would check whether end - begin is valid
+  return DDim(dim.Get() + begin, end - begin);
 }
-/// \cond HIDDEN
-struct ArityVisitor : boost::static_visitor<int> {
-  template <int D>
-  int operator()(Dim<D>) const {
-    return D;
-  }
-};
-/// \endcond
-
-int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
+int arity(const DDim& d) { return d.size(); }
-/// \cond HIDDEN
-struct DDimPrinter : boost::static_visitor<void> {
+struct DDimPrinter {
   std::ostream& os;
   explicit DDimPrinter(std::ostream& os_) : os(os_) {}
-  template <typename T>
-  void operator()(const T& t) {
+  template <int D>
+  void operator()(const Dim<D>& t) {
     os << t;
   }
 };
-/// \endcond
 std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
-  DDimPrinter printer(os);
-  boost::apply_visitor(printer, ddim);
+  ddim.apply_visitor(DDimPrinter(os));
   return os;
 }
-DDim::DDim(std::initializer_list<int64_t> init_list) {
-  *this = make_ddim(init_list);
-}
-
 DDim flatten_to_2d(const DDim& src, int num_col_dims) {
-  int rank = src.size();
-  return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
-                    product(slice_ddim(src, num_col_dims, rank))});
+  return DDim({product(slice_ddim(src, 0, num_col_dims)),
+               product(slice_ddim(src, num_col_dims, src.size()))});
 }
-DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
+DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); }
 DDim stride(const DDim& ddim) {
-  std::vector<int64_t> strides(ddim.size());
+  DDim strides;
+  strides.rank_ = ddim.size();
   strides[ddim.size() - 1] = 1;
   for (int i = ddim.size() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * ddim[i + 1];
   }
-  return framework::make_ddim(strides);
+  return strides;
 }
-DDim stride_numel(const framework::DDim& ddim) {
-  std::vector<int64_t> strides(ddim.size());
+DDim stride_numel(const DDim& ddim) {
+  DDim strides;
+  strides.rank_ = ddim.size();
   strides[ddim.size() - 1] = ddim[ddim.size() - 1];
   for (int i = ddim.size() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * ddim[i];
   }
-  return framework::make_ddim(strides);
+  return strides;
 }
 }  // namespace framework
...
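The rewrite above replaces the boost::variant plus recursive-visitor implementation with a flat Dim<kMaxRank> buffer and a runtime rank, while keeping the observable semantics of the helpers. A sketch of the invariants one would expect to still hold (illustrative only, assuming the public API shown above):

    // Illustrative invariants of the refactored DDim helpers.
    #include "paddle/fluid/framework/ddim.h"

    void ddim_demo() {
      using namespace paddle::framework;
      DDim d = make_ddim({2, 3, 4});
      // product multiplies all extents: 2 * 3 * 4 == 24
      int64_t n = product(d);
      // slice_ddim now copies a sub-range of the backing array directly
      DDim tail = slice_ddim(d, 1, 3);   // {3, 4}
      // flatten_to_2d collapses [0, 1) and [1, 3) into {2, 12}
      DDim flat = flatten_to_2d(d, 1);
      (void)n; (void)tail; (void)flat;
    }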
@@ -18,62 +18,145 @@ limitations under the License. */
 #include <stdexcept>
 #include <vector>
 #include "paddle/fluid/framework/dim.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/variant.h"
 namespace paddle {
 namespace framework {
+#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
+  case (rank): {                               \
+    constexpr auto kRank = (rank);             \
+    return (callback);                         \
+  }
+
+#define PADDLE_VISIT_DDIM(rank, callback)    \
+  switch (rank) {                            \
+    PADDLE_VISIT_DDIM_BASE(0, callback);     \
+    PADDLE_VISIT_DDIM_BASE(1, callback);     \
+    PADDLE_VISIT_DDIM_BASE(2, callback);     \
+    PADDLE_VISIT_DDIM_BASE(3, callback);     \
+    PADDLE_VISIT_DDIM_BASE(4, callback);     \
+    PADDLE_VISIT_DDIM_BASE(5, callback);     \
+    PADDLE_VISIT_DDIM_BASE(6, callback);     \
+    PADDLE_VISIT_DDIM_BASE(7, callback);     \
+    PADDLE_VISIT_DDIM_BASE(8, callback);     \
+    PADDLE_VISIT_DDIM_BASE(9, callback);     \
+    default:                                 \
+      PADDLE_THROW("Invalid rank %d", rank); \
+  }
+
+template <typename T1, typename T2>
+inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
+  PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
+}
+
 /**
  * \brief A dynamically sized dimension.
  *
  * The number of dimensions must be between [1, 9].
  */
-struct DDim {
-  typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
-                         Dim<7>, Dim<8>, Dim<9>>
-      DDimVar;
-  DDimVar var;
+class DDim {
+ public:
+  constexpr static int kMaxRank = 9;
-  DDim() : var(Dim<1>()) {}
+  DDim() : rank_(1) { dim_[0] = 0; }
+
+  DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
+
+  DDim(const int* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
+
+  DDim(const int64_t* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
   template <int D>
-  explicit DDim(const Dim<D>& in) : var(in) {}
+  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
+    UnsafeCast<D>() = in;
+  }
-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
+      : DDim(init_list.begin(), init_list.size()) {}
+
+  inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
   template <int D>
-  DDim& operator=(const Dim<D>& in) {
-    var = in;
+  inline DDim& operator=(const Dim<D>& dim) {
+    rank_ = D;
+    UnsafeCast<D>() = dim;
     return *this;
   }
-  int64_t& operator[](int idx);
-  int64_t operator[](int idx) const;
+  inline int64_t& operator[](int idx) { return dim_[idx]; }
+
+  inline int64_t operator[](int idx) const { return dim_[idx]; }
+
+  inline int64_t& at(int idx) {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
+
+  inline int64_t at(int idx) const {
+    PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx);
+    return dim_[idx];
+  }
   template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
   }
   template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
-    return var.apply_visitor(visitor);
+  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) const {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
   }
-  DDimVar getVar() { return var; }
+  bool operator==(const DDim& d) const;
+
+  bool operator!=(const DDim& d) const;
+
+  DDim operator+(const DDim& d) const;
-  bool operator==(DDim d) const;
-  bool operator!=(DDim d) const;
-  DDim operator+(DDim d) const;
-  DDim operator*(DDim d) const;
+  DDim operator*(const DDim& d) const;
+
+  inline const int64_t* Get() const { return dim_.Get(); }
+
+  inline int64_t* GetMutable() { return dim_.GetMutable(); }
+
+  inline int size() const { return rank_; }
+
+ private:
+  template <int D>
+  inline Dim<D>& UnsafeCast() {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<void*>(&dim_);
+    return *reinterpret_cast<Dim<D>*>(p);
+  }
+
+  template <int D>
+  inline const Dim<D>& UnsafeCast() const {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<const void*>(&dim_);
+    return *reinterpret_cast<const Dim<D>*>(p);
+  }
-  int size() const;
+  inline DDim& CopyFrom(const DDim& ddim) {
+    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
+  }
+
+  friend DDim stride(const DDim& ddim);
+  friend DDim stride_numel(const DDim& ddim);
+
+ private:
+  Dim<kMaxRank> dim_;
+  int rank_;
 };
+#undef PADDLE_VISIT_DDIM_BASE
+#undef PADDLE_VISIT_DDIM
+
 /**
  * \brief Make a DDim from std::vector<int64_t>
  *
@@ -92,7 +175,7 @@ DDim make_ddim(const std::vector<int>& dims);
 DDim make_ddim(std::initializer_list<int64_t> dims);
 int64_t get(const DDim& dim, int idx);
-void set(DDim& dim, int idx, int val);
+void set(DDim& dim, int idx, int val);  // NOLINT
 std::vector<int64_t> vectorize(const DDim& ddim);
 std::vector<int> vectorize2int(const DDim& ddim);
@@ -129,12 +212,3 @@ DDim stride(const DDim& ddim);
 DDim stride_numel(const DDim& ddim);
 }  // namespace framework
 }  // namespace paddle
-
-namespace boost {
-
-template <typename T>
-T get(const paddle::framework::DDim& in) {
-  return boost::get<T>(in.var);
-}
-
-}  // namespace boost
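With boost::variant gone, visitation is dispatched by the PADDLE_VISIT_DDIM switch on rank_, so a visitor is now any callable templated on Dim<D>; it no longer derives from boost::static_visitor, and the boost::get specialization removed at the end of the old header has no replacement. A minimal visitor in the new style, mirroring DDimPrinter above (a sketch, not code from this commit):

    // Sketch of a rank-generic visitor for the refactored DDim. There is no
    // boost::static_visitor base; the result type is deduced via
    // std::result_of inside DDim::apply_visitor.
    #include "paddle/fluid/framework/ddim.h"

    struct SumVisitor {
      template <int D>
      inline int64_t operator()(const paddle::framework::Dim<D>& dim) const {
        int64_t sum = 0;
        for (int i = 0; i < D; ++i) sum += dim[i];  // sum of all extents
        return sum;
      }
    };

    int64_t sum_dims(const paddle::framework::DDim& d) {
      return d.apply_visitor(SumVisitor());
    }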
@@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() {
 // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
 // this is a distributed or inter-process call, find a better way.
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   if (NoDummyInputSize() == 1 &&
       local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
 #else
...
@@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() {
     }
   } else {
     PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                 var->Type().name(), name);
+                 framework::ToTypeName(var->Type()), name);
   }
 }
...
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
-  size_t num_iteration_per_drop_scope_{100};
+  size_t num_iteration_per_drop_scope_{1};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
 };
...
@@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   ClearFetchOp(graph_.get(), &fetch_ops);
   return fetches;
 }
+
 void FastThreadedSSAGraphExecutor::RunOpAsync(
     std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
     OpHandleBase *op,
...
@@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 #endif
   int GetVarDeviceID(
-      const ir::Graph &graph, const std::string &varname,
+      const std::string &varname,
       const std::unordered_map<std::string, int> &sharded_var_device) const;
   bool IsScaleLossOp(ir::Node *node) const;
@@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
       ir::Graph *result, ir::Node *node,
       std::unordered_map<std::string, int> *sharded_var_device) const;
-  std::vector<std::string> FindDistTrainSendVars(
-      const std::vector<ir::Node *> &nodes) const;
-
-  std::vector<std::string> FindDistTrainRecvVars(
-      const std::vector<ir::Node *> &nodes) const;
-
   void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                               size_t num_places) const;
@@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                                  int dev_id) const;
   int GetOpDeviceID(
-      const ir::Graph &graph, ir::Node *node,
+      ir::Node *node,
       const std::unordered_map<std::string, int> &sharded_var_device) const;
   void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
@@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   void SetCommunicationContext(OpHandleBase *op_handle,
                                const platform::Place &p) const;
+  std::vector<ir::Node *> SortForReduceMode(
+      const std::vector<ir::Node *> &) const;
+
+  int GetOpDeviceID(
+      ir::Node *node,
+      const std::unordered_map<std::string, int> &shared_var_device,
+      std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
+      const;
+
   mutable std::string loss_var_name_;
   mutable std::vector<platform::Place> places_;
   mutable std::vector<Scope *> local_scopes_;
...
@@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
   }
   platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
-  drop_scope_counter_ += 1;
-  if (!fetch_tensors.empty() ||
-      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
-    drop_scope_counter_ = 0;
-    // Wait All computational streams
-    for (auto p : places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
+  ++drop_scope_counter_;
+  bool stream_end = false;
+  if (!fetch_tensors.empty()) {
+    WaitComputationalStreams();
+    stream_end = true;
+  }
+  if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    if (!stream_end) {
+      WaitComputationalStreams();
     }
     for (auto &scope : local_scopes_) {
       auto &local_scope =
           *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
       scope->DeleteScope(local_scope);
     }
+    drop_scope_counter_ = 0;
   }
   if (eptr) {
     std::rethrow_exception(eptr);
...
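The restructured Run above decouples the two reasons to synchronize: a fetch always waits on the computational streams, while scope dropping now happens only every num_iteration_per_drop_scope_ iterations (whose default is lowered to 1 in the ExecutionStrategy hunk further up). A condensed sketch of the control flow; the two callbacks stand in for WaitComputationalStreams() and the local-scope deletion loop, and are assumptions for illustration:

    #include <cstddef>
    #include <functional>

    // Condensed control flow of the rewritten Run() (illustrative only).
    void run_iteration(bool has_fetch, std::size_t& counter,
                       std::size_t drop_every,
                       const std::function<void()>& wait_streams,
                       const std::function<void()>& drop_scopes) {
      ++counter;
      bool stream_end = false;
      if (has_fetch) {
        wait_streams();  // a fetch always forces a sync
        stream_end = true;
      }
      if (counter == drop_every) {
        if (!stream_end) wait_streams();  // avoid waiting twice per iteration
        drop_scopes();
        counter = 0;
      }
    }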
@@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
+ private:
+  inline void WaitComputationalStreams() {
+    // Wait All computational streams
+    for (auto p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+  }
+
  private:
   size_t drop_scope_counter_{0};
...
@@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) {
   } else if (var->IsType<SelectedRows>()) {
     (*func)(var->GetMutable<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", var->Type().name());
+    PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
   }
 }
@@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) {
   } else if (var.IsType<SelectedRows>()) {
     (*func)(var.Get<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", var.Type().name());
+    PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
   }
 }
...
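var->Type() now returns an integer type index rather than a std::type_index, so the error paths switch from typeid's mangled .name() to ToTypeName, which maps the index back to a readable name. A toy analogue of that mapping (hypothetical; the real table lives in the var_type_traits files introduced by this commit):

    #include <string>

    // Toy analogue of framework::ToTypeName: map a registered type index
    // back to a human-readable name. Illustrative only; not the actual
    // var_type_traits implementation.
    enum VarTypeId : int { kLoDTensor = 0, kSelectedRows = 1 };

    inline std::string ToTypeNameDemo(int id) {
      switch (id) {
        case kLoDTensor:    return "LoDTensor";
        case kSelectedRows: return "SelectedRows";
        default:            return "unknown";
      }
    }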
@@ -16,332 +16,184 @@
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
+#include <string>
 #include <type_traits>
+
+#include "paddle/fluid/framework/array.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 namespace paddle {
 namespace framework {
 // Statically sized, statically indexed dimension
-template <int i>
-struct Dim {
-  static constexpr int dimensions = i;
+template <int D>
+class Dim : public Array<int64_t, D> {
+ public:
+  static_assert(D >= 0, "D must be not less than 0");
-  template <typename... Args>
-  HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
-    static_assert(sizeof...(_tail) == i - 1,
-                  "Dim initialized with the wrong number of parameters");
-  }
+  static constexpr int kRank = D;
+  using BaseClass = Array<int64_t, D>;
-  HOSTDEVICE
-  Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
+  HOSTDEVICE inline Dim(int64_t head, const Dim<D - 1>& tail) {
+    (*this)[0] = head;
+    new (this->GetMutable() + 1) Dim<D - 1>(tail);
+  }
-  HOSTDEVICE
-  Dim() : head(0), tail() {}
+  template <typename... Args>
+  HOSTDEVICE explicit Dim(int64_t head, Args... args)
+      : BaseClass(head, args...) {}
   /** Construct a Dim from a linear index and size. Uses Fortran order
    * indexing. */
-  HOSTDEVICE
-  Dim(int64_t idx, const Dim<i>& size)
-      : head(idx % size.head), tail(idx / size.head, size.tail) {}
+  HOSTDEVICE Dim(int64_t idx, const Dim<D>& size);
   /** Construct a Dim with each dimension set to the given index */
-  HOSTDEVICE
-  Dim(int64_t idx) : head(idx), tail(idx) {}
+  HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); }
-  HOSTDEVICE
-  bool operator==(const Dim<i>& o) const {
-    return (head == o.head) && (tail == o.tail);
-  }
-
-  HOSTDEVICE
-  bool operator!=(const Dim<i>& o) const { return !(*this == o); }
-
-  HOSTDEVICE
-  int64_t& operator[](int idx);
-  HOSTDEVICE
-  int64_t operator[](int idx) const;
+  HOSTDEVICE Dim() = default;
   HOST std::string to_string() const;
-
-  int64_t head;
-  Dim<i - 1> tail;
 };
-// Base case specialization
-template <>
-struct Dim<0> {
-  static constexpr int dimensions = 0;
-
-  HOSTDEVICE
-  Dim(int64_t _head) {}
-
-  HOSTDEVICE
-  Dim() {}
-
-  HOSTDEVICE
-  Dim(int idx, const Dim<0>& size) {
-#ifndef __CUDA_ARCH__
-    if (idx > 0) {
-      throw std::invalid_argument("Index out of range.");
-    }
-#else
-    PADDLE_ASSERT(idx == 0);
-#endif
-  }
-
-  HOSTDEVICE
-  bool operator==(const Dim<0>& o) const { return true; }
-
-  HOSTDEVICE
-  bool operator!=(const Dim<0>& o) const { return false; }
-
-  HOSTDEVICE
-  int64_t& operator[](int idx);
-  HOSTDEVICE
-  int64_t operator[](int idx) const;
-};
-
-namespace {
-
-// Helper for accessing Dim classes
-template <int i>
-struct DimGetter {
-  // Return a copy if Dim is const
-  template <typename D>
-  HOSTDEVICE static int64_t impl(const D& d) {
-    return DimGetter<i - 1>::impl(d.tail);
-  }
-  // Return a reference if Dim is mutable
-  template <typename D>
-  HOSTDEVICE static int64_t& impl(D& d) {
-    return DimGetter<i - 1>::impl(d.tail);
-  }
-};
-
-// Eureka! We found the element!
-template <>
-struct DimGetter<0> {
-  // Return a copy if Dim is const
-  template <typename D>
-  HOSTDEVICE static int64_t impl(const D& d) {
-    return d.head;
-  }
-  // Return a reference if Dim is mutable
-  template <typename D>
-  HOSTDEVICE static int64_t& impl(D& d) {
-    return d.head;
-  }
-};
-
-template <int D>
-HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  if (idx < 0) {
-    throw std::invalid_argument("Tried to access a negative dimension");
-  }
-#else
-  PADDLE_ASSERT(idx >= 0);
-#endif
-  if (idx == 0) {
-    return dim.head;
-  }
-  return indexer(dim.tail, idx - 1);
-}
-
-template <>
-HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  throw std::invalid_argument("Invalid index");
-#else
-  PADDLE_ASSERT(false);
-#if CUDA_VERSION < 8000
-  // On CUDA versions previous to 8.0, only __shared__ variables
-  // could be declared as static in the device code.
-  int64_t head = 0;
-#else
-  static int64_t head = 0;
-#endif
-  return head;
-#endif
-}
+namespace detail {
+template <int kStart, int kEnd, bool kStop>
+struct FortranOrderIndexingConstructorFunctor {
+  HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
+                                    int64_t* out) {
+    out[kStart] = (*idx) % in[kStart];
+    (*idx) /= in[kStart];
+    FortranOrderIndexingConstructorFunctor<kStart + 1, kEnd,
+                                           kStart + 1 == kEnd>::Run(in, idx,
+                                                                    out);
+  }
+};
+
+template <int kStart, int kEnd>
+struct FortranOrderIndexingConstructorFunctor<kStart, kEnd, true> {
+  HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx,
+                                    int64_t* out) {}
+};
+}  // namespace detail
+
+template <int D>
+HOSTDEVICE Dim<D>::Dim(int64_t idx, const Dim<D>& size) {
+  detail::FortranOrderIndexingConstructorFunctor<0, D, D == 0>::Run(
+      size.Get(), &idx, this->GetMutable());
+}
-template <int D>
-HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  if (idx < 0) {
-    throw std::invalid_argument("Tried to access a negative dimension");
-  }
-#else
-  PADDLE_ASSERT(idx >= 0);
-#endif
-  if (idx == 0) {
-    return dim.head;
-  }
-  return indexer(dim.tail, idx - 1);
-}
-
-template <>
-HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
-#ifndef __CUDA_ARCH__
-  throw std::invalid_argument("Invalid index");
#else
PADDLE_ASSERT(false);
#if CUDA_VERSION < 8000
// On CUDA versions previous to 8.0, only __shared__ variables
// could be declared as static in the device code.
int64_t head = 0;
#else
static int64_t head = 0;
#endif
return head;
#endif
}
} // namespace
// Static access to constant Dim
template <int i, int l>
HOSTDEVICE int64_t get(const Dim<l>& d) {
return DimGetter<i>::impl(d);
} }
// Static access to mutable Dim template <int idx, int D>
template <int i, int l> HOSTDEVICE inline int64_t get(const Dim<D>& dim) {
HOSTDEVICE int64_t& get(Dim<l>& d) { return dim[idx];
return DimGetter<i>::impl(d);
} }
// Dynamic access to constant Dim template <int idx, int D>
template <int l> HOSTDEVICE inline int64_t& get(Dim<D>& dim) { // NOLINT
HOSTDEVICE int64_t Dim<l>::operator[](int i) const { return dim[idx];
return indexer(*this, i);
} }
// Dynamic access to mutable Dim template <int D>
template <int l> HOSTDEVICE inline int64_t get(const Dim<D>& dim, int idx) {
HOSTDEVICE int64_t& Dim<l>::operator[](int i) { return dim[idx];
return indexer(*this, i);
}
// Dynamic access to constant Dim
inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
return indexer(*this, i);
}
// Dynamic access to mutable Dim
inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
return indexer(*this, i);
}
// Dynamic access to constant Dim
// without std::enable_if will try to instantiate this on get<0>(d)
template <int l>
HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
int i) {
return d[i];
} }
// Dynamic access to mutable Dim template <int D>
template <int l> HOSTDEVICE inline int64_t& get(Dim<D>& dim, int idx) { // NOLINT
HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d, return dim[idx];
int i) {
return d[i];
} }
// Dot product of two dims // Dot product of two dims
template <int i> template <int D>
HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline int64_t linearize(const Dim<D>& a, const Dim<D>& b) {
return a.head * b.head + linearize(a.tail, b.tail); return UnrollProduct<D>::Run(a.Get(), b.Get());
}
// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
return 0;
} }
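linearize is a plain dot product of an index Dim with a stride Dim, yielding a flat offset; a small worked check:
// linearize(a, b) = sum_i a[i] * b[i]
// e.g. linearize((1, 2, 2), (1, 2, 6)) = 1*1 + 2*2 + 2*6 = 17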
// Product of a Dim // Product of a Dim
template <int i> template <int D>
HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) { HOSTDEVICE inline int64_t product(const Dim<D>& a) {
return prod * a.head * product(a.tail); return UnrollProduct<D>::Run(a.Get());
}
// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
return prod;
} }
// Is 0 <= idx_i < size_i for all i? // Is 0 <= idx_i < size_i for all i?
template <int i> namespace detail {
HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) { template <int kStart, int kEnd, bool kStop>
return ((0 <= idx.head) && (idx.head < size.head) && struct ContainedFunctor {
contained(idx.tail, size.tail)); HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) {
} return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) &&
ContainedFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(idx,
size);
}
};
// Base case of is 0 <= idx_i < size_i ? template <int kStart, int kEnd>
// Notice it is inline because it is no longer a template struct ContainedFunctor<kStart, kEnd, true> {
template <> HOSTDEVICE static constexpr inline bool Run(const int64_t* idx,
HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) { const int64_t* size) {
return true; return true;
}
};
} // namespace detail
template <int D>
HOSTDEVICE inline bool contained(const Dim<D>& idx, const Dim<D>& size) {
return detail::ContainedFunctor<0, D, D == 0>::Run(idx.Get(), size.Get());
} }
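contained is the bounds check used before indexing; two quick examples:
// contained((1, 2), (2, 3)) == true   // 0 <= 1 < 2 and 0 <= 2 < 3
// contained((2, 0), (2, 3)) == false  // fails because 2 is not < 2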
/** /**
* \brief Compute exclusive prefix-multiply of a Dim. * \brief Compute exclusive prefix-multiply of a Dim.
*/ */
template <int i> namespace detail {
HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) { template <int kStart, int kEnd, bool kStop>
return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head)); struct ExPrefixMulFunctor {
} HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {
kStart == 0 ? out[kStart] = 1 : out[kStart] =
out[kStart - 1] * in[kStart - 1];
detail::ExPrefixMulFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(in,
out);
}
};
template <int kStart, int kEnd>
struct ExPrefixMulFunctor<kStart, kEnd, true> {
HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {}
};
} // namespace detail
///\cond HIDDEN template <int D>
// Base case of ex_prefix_mul HOSTDEVICE inline Dim<D> ex_prefix_mul(const Dim<D>& src) {
// Notice it is inline because it is no longer a template Dim<D> ret;
template <> detail::ExPrefixMulFunctor<0, D, D == 0>::Run(src.Get(), ret.GetMutable());
HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) { return ret;
return Dim<0>();
} }
///\endcond
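The exclusive prefix-multiply produces Fortran-order strides: entry i is the product of all sizes before i, with entry 0 fixed at 1. For example:
// ex_prefix_mul((2, 3, 4)) == (1, 2, 6)
//   out[0] = 1,  out[1] = 1 * 2 = 2,  out[2] = 2 * 3 = 6
// these are exactly the strides consumed by linearize above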
/** /**
* Add two dimensions together * Add two dimensions together
*/ */
template <int i> template <int D>
HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline Dim<D> dim_plus(const Dim<D>& a, const Dim<D>& b) {
return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail)); Dim<D> ret;
} UnrollAdd<D>::Run(a.Get(), b.Get(), ret.GetMutable());
return ret;
// Base case
template <>
HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
} }
template <int i> template <int D>
HOSTDEVICE Dim<i> operator+(const Dim<i>& lhs, const Dim<i>& rhs) { HOSTDEVICE inline Dim<D> operator+(const Dim<D>& lhs, const Dim<D>& rhs) {
return dim_plus(lhs, rhs); return dim_plus(lhs, rhs);
} }
/** /**
* Multiply two dimensions together * Multiply two dimensions together
*/ */
template <int i> template <int D>
HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) { HOSTDEVICE inline Dim<D> dim_mult(const Dim<D>& a, const Dim<D>& b) {
return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail)); Dim<D> ret;
} UnrollMul<D>::Run(a.Get(), b.Get(), ret.GetMutable());
return ret;
// Base case
template <>
HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
} }
template <int i> template <int D>
HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) { HOSTDEVICE Dim<D> operator*(const Dim<D>& lhs, const Dim<D>& rhs) {
return dim_mult(lhs, rhs); return dim_mult(lhs, rhs);
} }
...@@ -354,23 +206,32 @@ HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) { ...@@ -354,23 +206,32 @@ HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) {
* \return Dim object the same size as \p size with normalized strides * \return Dim object the same size as \p size with normalized strides
* *
*/ */
namespace detail {
template <int kStart, int kEnd, bool kStop>
struct NormalizeStridesFunctor {
HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
int64_t* ret) {
ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]);
NormalizeStridesFunctor<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
size, stride, ret);
}
};
template <int i> template <int kStart, int kEnd>
HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) { struct NormalizeStridesFunctor<kStart, kEnd, true> {
int norm_stride = size.head == 1 ? 0 : stride.head; HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride,
return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail)); int64_t* ret) {}
} };
} // namespace detail
///\cond HIDDEN
template <> template <int D>
HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, HOSTDEVICE Dim<D> normalize_strides(const Dim<D>& size, const Dim<D>& stride) {
const Dim<0>& stride) { Dim<D> ret;
return Dim<0>(); detail::NormalizeStridesFunctor<0, D, D == 0>::Run(size.Get(), stride.Get(),
ret.GetMutable());
return ret;
} }
///\endcond
/** /**
* Helper function to create a Dim * Helper function to create a Dim
* *
...@@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, ...@@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
*/ */
template <typename... Args> template <typename... Args>
HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) { HOSTDEVICE inline Dim<sizeof...(Args)> make_dim(Args... idxes) {
return Dim<sizeof...(Args)>(idxes...); return Dim<sizeof...(Args)>(idxes...);
} }
// Allows us to output a Dim // Allows us to output a Dim
// XXX For some reason, overloading fails to resolve this correctly template <int D>
template <int i> inline std::ostream& operator<<(std::ostream& os, const Dim<D>& d) {
typename std::enable_if<(i > 1), std::ostream&>::type operator<<( os << d[0];
std::ostream& os, const Dim<i>& d) { for (int i = 1; i < D; ++i) {
os << d.head << ", " << d.tail; os << ", " << d[i];
return os; }
}
// Base case that allows us to output a Dim
// XXX I wish this could be an overload instead of a template
template <int i>
typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
std::ostream& os, const Dim<i>& d) {
os << d.head;
return os; return os;
} }
...@@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { ...@@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
return os; return os;
} }
template <int i> template <int D>
HOST std::string Dim<i>::to_string() const { HOST std::string Dim<D>::to_string() const {
std::stringstream stream; std::stringstream stream;
stream << *this; stream << *this;
return stream.str(); return stream.str();
} }
template <int D> template <int D>
HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) { HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, const Dim<D>& extents) {
Dim<D> result; Dim<D> result;
for (int i = 0; i < D - 1; ++i) { for (int i = 0; i < D - 1; ++i) {
...@@ -428,5 +279,10 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) { ...@@ -428,5 +279,10 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
return result; return result;
} }
template <int D, typename T1, typename T2>
inline void static_dim_assign(const T1* in, T2* out) {
UnrollAssign<D>::Run(in, out);
}
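A sketch of how the new helper can be used; the raw shape buffer below is hypothetical, but copying a dynamic shape into a static Dim<D> is the motivating case:
// Hypothetical usage: materialize a static Dim<3> from a raw shape buffer.
const int64_t shape[3] = {2, 3, 4};
paddle::framework::Dim<3> d;
paddle::framework::static_dim_assign<3>(shape, d.GetMutable());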
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { ...@@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) {
struct DLContextVisitor : public boost::static_visitor<::DLContext> { struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CPUPlace &place) const { inline ::DLContext operator()(const platform::CPUPlace &place) const {
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLCPU; ctx.device_type = kDLCPU;
ctx.device_id = 0; ctx.device_id = 0;
return ctx; return ctx;
...@@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { ...@@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CUDAPlace &place) const { inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLGPU; ctx.device_type = kDLGPU;
ctx.device_id = place.device; ctx.device_id = place.device;
return ctx; return ctx;
...@@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { ...@@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DLContext ctx; ::DLContext ctx;
ctx.device_type = kDLCPUPinned; ctx.device_type = kDLCPUPinned;
ctx.device_id = 0; ctx.device_id = 0;
return ctx; return ctx;
......
...@@ -38,7 +38,7 @@ class DLPackTensor { ...@@ -38,7 +38,7 @@ class DLPackTensor {
// The shape in DLTensor is defined as int64_t* // The shape in DLTensor is defined as int64_t*
// Add this member to make TVMTensor init without heap allocation // Add this member to make TVMTensor init without heap allocation
ShapeType shape_[9]; ShapeType shape_[DDim::kMaxRank];
}; };
} // namespace framework } // namespace framework
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -119,7 +119,7 @@ static void DeleteUnusedTensors( ...@@ -119,7 +119,7 @@ static void DeleteUnusedTensors(
} }
} else { } else {
PADDLE_THROW("Type %s of %s is not supported eager deletion", PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name); framework::ToTypeName(var->Type()), name);
} }
} }
} }
......
...@@ -29,6 +29,7 @@ limitations under the License. */ ...@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() { ...@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() {
return; return;
#else #else
static unsigned concurrency_cap = std::thread::hardware_concurrency(); static unsigned concurrency_cap = std::thread::hardware_concurrency();
LOG(WARNING) << "concurrency capacity " << concurrency_cap;
int thread_id = this->thread_id_; int thread_id = this->thread_id_;
if (static_cast<unsigned>(thread_id) < concurrency_cap) { if (static_cast<unsigned>(thread_id) < concurrency_cap) {
...@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) { ...@@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
} }
void ExecutorThreadWorker::TrainFilesWithTimer() {
platform::SetNumThreads(1);
SetDevice();
thread_reader_->Start();
std::vector<double> op_total_time;
std::vector<std::string> op_name;
for (auto& op : ops_) {
op_name.push_back(op->Type());
}
op_total_time.resize(ops_.size());
for (size_t i = 0; i < op_total_time.size(); ++i) {
op_total_time[i] = 0.0;
}
platform::Timer timeline;
double total_time = 0.0;
double read_time = 0.0;
int cur_batch;
int batch_cnt = 0;
timeline.Start();
while ((cur_batch = thread_reader_->Next()) > 0) {
timeline.Pause();
read_time += timeline.ElapsedSec();
total_time += timeline.ElapsedSec();
for (size_t i = 0; i < ops_.size(); ++i) {
timeline.Start();
ops_[i]->Run(*thread_scope_, place_);
timeline.Pause();
op_total_time[i] += timeline.ElapsedSec();
total_time += timeline.ElapsedSec();
}
++batch_cnt;
thread_scope_->DropKids();
if (thread_id_ == 0) {
if (batch_cnt > 0 && batch_cnt % 1000 == 0) {
for (size_t i = 0; i < ops_.size(); ++i) {
fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
op_name[i].c_str(), op_total_time[i] / batch_cnt);
}
fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
int fetch_var_num = fetch_var_names_.size();
for (int i = 0; i < fetch_var_num; ++i) {
print_fetch_var(thread_scope_, fetch_var_names_[i]);
}
}
}
timeline.Start();
}
}
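The timing idiom above brackets each measured region with Start()/Pause() and reads ElapsedSec() after every Pause(), accumulating per-op totals. Condensed, assuming platform::Timer behaves as used in this function:
platform::Timer t;
double cost = 0.0;
t.Start();
DoWork();                // hypothetical workload standing in for ops_[i]->Run()
t.Pause();
cost += t.ElapsedSec();  // seconds between the matching Start()/Pause()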
void ExecutorThreadWorker::TrainFiles() { void ExecutorThreadWorker::TrainFiles() {
platform::SetNumThreads(1); platform::SetNumThreads(1);
...@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr( ...@@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr(
std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) { std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {
_pslib_ptr = pslib_ptr; _pslib_ptr = pslib_ptr;
} }
void AsyncExecutorThreadWorker::SetPullDenseThread( void AsyncExecutorThreadWorker::SetPullDenseThread(
std::shared_ptr<DensePullThread> dpt) { std::shared_ptr<DensePullThread> dpt) {
_pull_dense_thread = dpt; _pull_dense_thread = dpt;
} }
void AsyncExecutorThreadWorker::TrainOneNetwork() { void AsyncExecutorThreadWorker::TrainOneNetwork() {
PrepareParams(); PrepareParams();
......
...@@ -155,6 +155,8 @@ class ExecutorThreadWorker { ...@@ -155,6 +155,8 @@ class ExecutorThreadWorker {
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed); void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
// A multi-thread training function // A multi-thread training function
virtual void TrainFiles(); virtual void TrainFiles();
// with timer log
virtual void TrainFilesWithTimer();
// set fetch variable names from python interface assigned by users // set fetch variable names from python interface assigned by users
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names); void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
......
...@@ -45,6 +45,7 @@ pass_library(is_test_pass base) ...@@ -45,6 +45,7 @@ pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
if(WITH_MKLDNN) if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base) pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base) pass_library(depthwise_conv_mkldnn_pass base)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
#include <functional>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_CONV_BN_NODES(pattern_name) \
/* OPERATORS */ \
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
/* CONV inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \
/* CONV outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \
/* Affine Channel inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \
/* Affine channel outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
const ir::Node& ac_scale,
const LoDTensor& ac_bias_tensor,
LoDTensor* eltwise_y_in_tensor) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from AffineChannel
PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims());
auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
scale_tensor->numel(), 1);
ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
ac_bias_tensor.numel(), 1);
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
// Re-compute weight of conv2d from AffineChannel
auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
auto weights_shape = weights->dims();
auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
EigenMatrixArrayMap weights_array_2d(
weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= scale_array;
}
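The arithmetic being folded: affine_channel computes out[c] = scale[c] * conv_out[c] + ac_bias[c] per output channel, so rescaling each output-channel slice of the filter and rewriting the bias, as above, leaves results unchanged. A toy check with one channel and scalar values:
// conv weight w = 2, input x = 3, existing bias b = 1,
// affine scale s = 4, affine bias a = 5:
//   original: (w*x + b) * s + a  = (2*3 + 1) * 4 + 5 = 33
//   fused:    (w*s)*x + (b*s + a) = 8*3 + 9           = 33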
std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvAffineChannel fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+affinechannel fuse";
return;
}
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
eltwise_y_in_desc.SetPersistable(true);
auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
auto* eltwise_y_in_tensor =
scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
// Get affine_channel bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
// Initialize eltwise_y
eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 0.0f);
// update weights and biases
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// create an elementwise add node.
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
IR_NODE_LINK_TO(eltwise_op, ac_out);
found_conv_ac_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_ac_count);
return graph;
}
std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvBN fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
// OPERATORS
GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
// BIAS inputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
// BIAS outputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
// Get eltwise_y (conv bias) variable
auto* eltwise_y_in_tensor =
scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
// Get batch norm bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// Update the elementwise_add node
eltwise->Op()->SetAttr("axis", 1);
eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
GraphSafeRemoveNodes(graph.get(),
{ac_scale, ac_bias, affine_channel, eltwise_out});
IR_NODE_LINK_TO(eltwise, ac_out);
found_conv_ac_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_ac_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the conv2d and affine_channel ops.
*/
class ConvAffineChannelFusePass : public FusePassBase {
public:
virtual ~ConvAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
public:
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc( ...@@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc(
const std::string& output) { const std::string& output) {
auto proto = base_desc; auto proto = base_desc;
framework::OpDesc desc(proto, nullptr); framework::OpDesc desc(proto, nullptr);
desc.SetType("conv2d_fusion");
desc.SetInput("Bias", {bias}); desc.SetInput("Bias", {bias});
desc.SetInput("ResidualData", {bias1}); desc.SetInput("ResidualData", {bias1});
desc.SetAttr("activation", activation); desc.SetAttr("activation", activation);
desc.SetOutput("Output", {output}); desc.SetOutput("Output", {output});
desc.SetAttr("is_test", true); desc.SetAttr("is_test", true);
desc.SetAttr("use_cudnn", false);
desc.Flush();
return *desc.Proto(); return *desc.Proto();
} }
std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name = "conv_elementwise_add_act_fuse"; const std::string pattern_name = "conv_elementwise_add2_act_fuse";
FusePassBase::Init(pattern_name, graph.get()); FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd; GraphPatternDetector gpd;
...@@ -76,22 +78,23 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl( ...@@ -76,22 +78,23 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
framework::OpDesc new_op_desc(new_op_proto, nullptr); framework::OpDesc new_op_desc(new_op_proto, nullptr);
// Create a new node for the fused op. // Create a new node for the fused op.
graph->CreateOpNode(&new_op_desc); auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
// Link inputs and outputs. // Link inputs and outputs.
PADDLE_ENFORCE(subgraph.count(x)); PADDLE_ENFORCE(subgraph.count(x));
auto* conv_in_node = subgraph.at(x); auto* conv_in_node = subgraph.at(x);
IR_NODE_LINK_TO(conv_in_node, conv_op); // Input IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input
IR_NODE_LINK_TO(conv_filter, conv_op); // Filter IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter
IR_NODE_LINK_TO(conv_op, conv_out); // Output IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias
IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias
IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias IR_NODE_LINK_TO(new_conv_op, act_out); // Output
// Delete the unneeded nodes. // Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), GraphSafeRemoveNodes(
{conv_op, elementwise_add_op, elementwise_add_op_1, graph.get(),
elementwise_add_out}); {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
elementwise_add_out, elementwise_add_out_1, act_op});
}; };
gpd(graph.get(), handler); gpd(graph.get(), handler);
return graph; return graph;
......
...@@ -23,66 +23,8 @@ limitations under the License. */ ...@@ -23,66 +23,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
namespace {
void CheckProgram(const ProgramDesc &program) {
#define _INT(role) static_cast<int>(role)
std::map<int, bool> visit;
for (OpDesc *op : program.Block(0).AllOps()) {
// For backward compatibility, some program doesn't have role added.
if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
int role_id =
boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
visit[role_id] = true;
switch (role_id) {
case _INT(OpRole::kForward):
if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
LOG(ERROR) << "Cannot add backward operator before forward operator "
<< op->Type();
}
break;
case _INT(OpRole::kBackward):
case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add backward operator %s after optimize operator.",
op->Type());
break;
case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
_INT(OpRole::kLoss)) == visit.end(),
"Cannot add backward|loss operator before "
"forward|loss operator %s.",
op->Type());
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add forward|loss operator %s after optimize operator.",
op->Type());
break;
case _INT(OpRole::kOptimize):
case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
"Optimize operators %s must follow backward operator.",
op->Type());
break;
case _INT(OpRole::kLRSched):
case _INT(OpRole::kDist):
case _INT(OpRole::kRPC):
case _INT(OpRole::kNotSpecified):
break;
default:
LOG(FATAL) << "Unknown operator role. Don't add new role because "
"you don't know what you are doing.";
}
}
#undef _INT
}
} // namespace
Graph::Graph(const ProgramDesc &program) : program_(program) { Graph::Graph(const ProgramDesc &program) : program_(program) {
CheckProgram(program_);
auto var_nodes = InitFromProgram(program_); auto var_nodes = InitFromProgram(program_);
ResolveHazard(var_nodes); ResolveHazard(var_nodes);
} }
......
...@@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { ...@@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
return out_var; return out_var;
} }
std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu", std::unordered_set<std::string> conv_act_set({"identity", "relu"});
"relu6", "relux", "tanh",
"band_pass"});
PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
conv_in->AsInput(); conv_in->AsInput();
...@@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { ...@@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
->AsInput(); ->AsInput();
auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
->assert_is_op_output("elementwise_add") ->assert_is_op_output("elementwise_add")
->assert_is_op_input("elementwise_add", "X") ->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate(); ->AsIntermediate();
auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
->assert_is_op("elementwise_add"); ->assert_is_op("elementwise_add");
auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
->assert_is_op_input("elementwise_add", "Y") ->assert_is_op_input("elementwise_add", "X")
->AsInput(); ->AsInput();
auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
->assert_is_op_output("elementwise_add") ->assert_is_op_output("elementwise_add")
...@@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { ...@@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
.LinksTo({elementwise_add_out}); .LinksTo({elementwise_add_out});
elementwise_add_op_1->LinksFrom( elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
{elementwise_add_out, elementwise_add_in_y_1}); .LinksTo({elementwise_add_out_1});
act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
return act_out; return act_out;
} }
...@@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { ...@@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
return elementwise_add_out; return elementwise_add_out;
} }
PDNode *patterns::ConvAffineChannel::operator()(
paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) {
// Create Operators
conv_input->assert_is_op_input("conv2d", "Input");
auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
PDNode *eltwise_op = nullptr;
if (with_eltwise_add) {
eltwise_op =
pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
}
auto *affine_channel_op =
pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel");
// Create variables
// Conv Filter
auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter");
auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("conv2d");
PDNode *eltwise_y_in_var = nullptr;
PDNode *eltwise_out_var = nullptr;
if (with_eltwise_add) {
// Conv output as Bias input
conv_out_var->assert_is_op_input("elementwise_add", "X");
// Bias
eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr())
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
eltwise_out_var = pattern->NewNode(eltwise_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("elementwise_add");
} else {
// Conv output as AffineChannel input
conv_out_var->assert_is_op_input("affine_channel", "X");
}
// AC Scale
auto *ac_scale_var = pattern->NewNode(ac_scale_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("affine_channel", "Scale");
// AC Bias
auto *ac_bias_var = pattern->NewNode(ac_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("affine_channel", "Bias");
// AC output
auto *ac_out_var = pattern->NewNode(ac_out_repr())
->AsOutput()
->assert_is_op_output("affine_channel");
conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
if (with_eltwise_add) {
eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var})
.LinksTo({eltwise_out_var});
affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var})
.LinksTo({ac_out_var});
} else {
affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var})
.LinksTo({ac_out_var});
}
return ac_out_var;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase { ...@@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase {
PATTERN_DECL_NODE(elementwise_add_out); PATTERN_DECL_NODE(elementwise_add_out);
}; };
// Conv with affine_channel
// op: conv + (elementwise_add +) affine_channel
// named nodes:
// conv_weight, conv_out, conv,
// ac_x, ac_scale, ac_bias
// affine_channel, ac_out
struct ConvAffineChannel : public PatternBase {
ConvAffineChannel(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_affine_channel") {}
PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
// declare operator node's name
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(affine_channel);
PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD
// CONV inputs
PATTERN_DECL_NODE(conv_weight); // Filter
// CONV outputs
PATTERN_DECL_NODE(conv_out); // tmp
// ELTWISE inputs
PATTERN_DECL_NODE(eltwise_y_in);
// ELTWISE outputs
PATTERN_DECL_NODE(eltwise_out); // tmp
// AC(Affine_Channel) inputs
PATTERN_DECL_NODE(ac_scale);
PATTERN_DECL_NODE(ac_bias);
// AC outputs
PATTERN_DECL_NODE(ac_out); // Out
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.
......
...@@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std::vector<Node*> optimize_ops; std::vector<Node*> optimize_ops;
std::vector<Node*> lr_ops; // ops other than forward/backward/optimize std::vector<Node*> lr_ops; // ops other than forward/backward/optimize
std::unordered_set<std::string> grad_names; std::unordered_set<std::string> grad_names;
std::unordered_map<std::string, std::string> gradname2paramname;
std::vector<ir::Node*> nodes = TopologySortOperations(*graph); std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
auto origin_nodes = graph->ReleaseNodes(); auto origin_nodes = graph->ReleaseNodes();
...@@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var); auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
for (size_t i = 0; i < op_role_vars.size(); i += 2) { for (size_t i = 0; i < op_role_vars.size(); i += 2) {
grad_names.insert(op_role_vars[i + 1]); grad_names.insert(op_role_vars[i + 1]);
gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
} }
} else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) { } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
lr_ops.push_back(node); lr_ops.push_back(node);
...@@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
// 2. copy forward backward // 2. copy forward backward
ir::Node* prev_repeat_last_op_node = nullptr; ir::Node* prev_repeat_last_op_node = nullptr;
// record origin_grad -> repeated grad list map. // record origin_grad -> repeated_grad_list map.
std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map; std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
std::map<std::string, std::vector<ir::Node*>> created; std::map<std::string, std::vector<ir::Node*>> created;
std::unordered_set<std::string> bn_vars_need_rename; std::unordered_set<std::string> bn_vars_need_rename;
...@@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
if (grad_names.find(outname) != grad_names.end()) { if (grad_names.find(outname) != grad_names.end()) {
std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
repeated_op.RenameOutput(outname, new_gname); repeated_op.RenameOutput(outname, new_gname);
// remove op_role_var for backward ops that output the grad of a
// parameter.
repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
std::vector<std::string>());
} }
} }
// 3.5 let batch_norm ops use independent vars, note batch_norm_grad do // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
// not need this update // not need this update, because only moving mean and variance should be
// different; the trainable scale and bias parameters are the same as
// other parameters.
if (node->Name() == "batch_norm") { if (node->Name() == "batch_norm") {
// NOTE: assume bn op created by layers use save var as output mean and // NOTE: assume bn op created by layers use save var as output mean and
// variance // variance
...@@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
var->inputs.push_back(repeated_node); var->inputs.push_back(repeated_node);
} }
} }
} } // end copy forward backward
// 5. create GRAD merge op node // 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
// scale(1/num_repeats)
for (auto kv : grad_repeated_map) { for (auto kv : grad_repeated_map) {
OpDesc sum_op; OpDesc sum_op;
sum_op.SetType("sum"); sum_op.SetType("sum");
std::vector<std::string> repeated_grad_names; std::vector<std::string> repeated_grad_names;
std::vector<std::string> param_grad_op_role_var;
for (auto r : kv.second) { for (auto r : kv.second) {
repeated_grad_names.push_back(r->Var()->Name()); repeated_grad_names.push_back(r->Var()->Name());
} }
// NOTE: use op_role_var to control allreduce op appending in
// multi_devices_graph_pass; we want to append op_role_var
// only once for the merged gradient, so break after first call.
param_grad_op_role_var.push_back(
gradname2paramname.at(kv.first->Var()->Name())); // param
param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad
sum_op.SetInput("X", repeated_grad_names); sum_op.SetInput("X", repeated_grad_names);
sum_op.SetOutput("Out", {kv.first->Var()->Name()}); sum_op.SetOutput("Out", {kv.first->Var()->Name()});
sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
...@@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl( ...@@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats)); scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kBackward)); static_cast<int>(OpRole::kBackward));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
param_grad_op_role_var);
auto scale_op_node = result.CreateOpNode(&scale_op); auto scale_op_node = result.CreateOpNode(&scale_op);
scale_op_node->inputs.push_back(sum_out_var_node); scale_op_node->inputs.push_back(sum_out_var_node);
sum_out_var_node->outputs.push_back(scale_op_node); sum_out_var_node->outputs.push_back(scale_op_node);
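Net effect of the sum + scale pair: the merged gradient is the mean of the per-repeat gradients, so the optimizer sees a gradient on the same scale as a single unrepeated batch. For example, with num_repeats = 4:
// origin_grad = (grad.repeat.0 + grad.repeat.1 + grad.repeat.2 +
//                grad.repeat.3) * (1.0f / 4)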
......
...@@ -215,8 +215,8 @@ class Vector { ...@@ -215,8 +215,8 @@ class Vector {
auto stream = dev_ctx->stream(); auto stream = dev_ctx->stream();
void *src = gpu_->ptr(); void *src = gpu_->ptr();
void *dst = cpu_.data(); void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
gpu_->size(), stream); gpu_->size(), stream);
dev_ctx->Wait(); dev_ctx->Wait();
} }
...@@ -261,8 +261,8 @@ class Vector { ...@@ -261,8 +261,8 @@ class Vector {
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream(); auto stream = dev_ctx->stream();
memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
gpu_->size(), stream); gpu_->size(), stream);
} }
void ImmutableCPU() const { void ImmutableCPU() const {
...@@ -284,7 +284,7 @@ class Vector { ...@@ -284,7 +284,7 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; } bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_; mutable std::vector<T> cpu_;
mutable memory::AllocationPtr gpu_; mutable paddle::memory::AllocationPtr gpu_;
mutable int flag_; mutable int flag_;
mutable std::mutex mtx_; mutable std::mutex mtx_;
......
...@@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() { ...@@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() {
BuildNgNodes(); BuildNgNodes();
ngraph_function_ = nullptr; ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs; ngraph::NodeVector func_outputs;
ngraph::op::ParameterVector func_inputs; ngraph::ParameterVector func_inputs;
for (auto& vo : var_out_) { for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo)); func_outputs.push_back(var_node_map_->at(vo));
......
...@@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, ...@@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namescope.") AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namescope.")
.SetDefault(""); .SetDefault("");
AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
"Callstack for Op Creatation.")
.SetDefault({});
Validate(); Validate();
} }
......
...@@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker { ...@@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker {
static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpRoleVarAttrName() { return "op_role_var"; }
static const char *OpNamescopeAttrName() { return "op_namescope"; } static const char *OpNamescopeAttrName() { return "op_namescope"; }
static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
......
...@@ -23,7 +23,8 @@ limitations under the License. */ ...@@ -23,7 +23,8 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "glog/logging.h" // For VLOG() #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h" // For VLOG()
#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/details/op_registry.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
......
...@@ -16,15 +16,9 @@ limitations under the License. */ ...@@ -16,15 +16,9 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/transfer_scope_cache.h"
...@@ -162,67 +156,31 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, ...@@ -162,67 +156,31 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames,
} }
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
try { VLOG(4) << place << " " << DebugStringEx(&scope);
if (VLOG_IS_ON(4)) { if (platform::is_gpu_place(place)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
}
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place); PADDLE_THROW("Cannot run operator on place %s", place);
#else #else
auto dev_id = boost::get<platform::CUDAPlace>(place).device; auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
#endif #endif
} }
// The profile has a process-wide mutex, which results in a serious
// performance issue
// in a concurrency scenario. Here we use an `if` to fix this issue.
// Please do not remove the `if`; ask @Superjomn if there is any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
} else {
RunImpl(scope, place);
}
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope);
}
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) { // The profile has a process-wide mutex, which results in a serious performance issue
throw exception; // in a concurrency scenario. Here we use an `if` to fix this issue.
} // Please do not remove the `if`; ask @Superjomn if there is any concern.
std::ostringstream sout; if (platform::IsProfileEnabled()) {
sout << "Invoke operator " << Type() << " error.\n"; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
sout << "Python Callstacks: \n"; platform::RecordEvent record_event(Type(), pool.Get(place));
for (auto& line : callstack) { RunImpl(scope, place);
sout << line; } else {
} RunImpl(scope, place);
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
} catch (...) {
std::rethrow_exception(std::current_exception());
} }
VLOG(3) << place << " " << DebugStringEx(&scope);
} }
bool OperatorBase::HasInputs(const std::string& name) const { bool OperatorBase::HasInputs(const std::string& name) const {
if (inputs_.find(name) != inputs_.end()) { return inputs_.find(name) != inputs_.end();
return true;
} else {
return false;
}
} }
std::string OperatorBase::Input(const std::string& name) const { std::string OperatorBase::Input(const std::string& name) const {
...@@ -421,7 +379,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { ...@@ -421,7 +379,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
return &(var.Get<SelectedRows>().value()); return &(var.Get<SelectedRows>().value());
} else { } else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var.Type().name()); ToTypeName(var.Type()));
} }
} }
...@@ -432,7 +390,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { ...@@ -432,7 +390,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
return var->GetMutable<SelectedRows>()->mutable_value(); return var->GetMutable<SelectedRows>()->mutable_value();
} else { } else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name()); ToTypeName(var->Type()));
} }
} }
...@@ -526,7 +484,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>( ...@@ -526,7 +484,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
PADDLE_ENFORCE( PADDLE_ENFORCE(
var->IsType<LoDTensor>(), var->IsType<LoDTensor>(),
"should be LoDTensor, but the received type is %s", "should be LoDTensor, but the received type is %s",
var->Type().name()); ToTypeName(var->Type()));
return &(var->Get<LoDTensor>()); return &(var->Get<LoDTensor>());
}); });
return res; return res;
...@@ -545,7 +503,7 @@ const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>( ...@@ -545,7 +503,7 @@ const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
PADDLE_ENFORCE( PADDLE_ENFORCE(
var->IsType<LoDTensor>(), var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s", "%s should be LoDTensor, but the received type is %s",
sub_name, var->Type().name()); sub_name, ToTypeName(var->Type()));
return &(var->Get<LoDTensor>()); return &(var->Get<LoDTensor>());
}); });
return res; return res;
...@@ -574,7 +532,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>( ...@@ -574,7 +532,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
PADDLE_ENFORCE( PADDLE_ENFORCE(
var->IsType<LoDTensor>(), var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s", "%s should be LoDTensor, but the received type is %s",
sub_name, var->Type().name()); sub_name, ToTypeName(var->Type()));
return var->GetMutable<LoDTensor>(); return var->GetMutable<LoDTensor>();
}); });
return res; return res;
...@@ -816,7 +774,7 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -816,7 +774,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
PADDLE_THROW( PADDLE_THROW(
"Only LoDTensor/SelectedRows support 'GetDim', but Variables " "Only LoDTensor/SelectedRows support 'GetDim', but Variables "
"type_id is %s.", "type_id is %s.",
var->Type().name()); ToTypeName(var->Type()));
} }
} }
...@@ -839,7 +797,7 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -839,7 +797,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
var->GetMutable<SelectedRows>()->set_height(dim[0]); var->GetMutable<SelectedRows>()->set_height(dim[0]);
} else { } else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name()); ToTypeName(var->Type()));
} }
} }
...@@ -1082,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData( ...@@ -1082,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType( proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const { const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1; int data_type = -1;
std::string last_input_name;
for (auto& input : this->inputs_) { for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) { const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
auto* var = scope.FindVar(ipt_name); for (size_t i = 0; i < vars.size(); ++i) {
const Variable* var = vars[i];
if (var != nullptr) { if (var != nullptr) {
const Tensor* t = nullptr; const Tensor* t = nullptr;
if (var->IsType<Tensor>()) { if (var->IsType<Tensor>()) {
...@@ -1098,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -1098,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &(var->Get<SelectedRows>().value()); t = &(var->Get<SelectedRows>().value());
} }
if (t != nullptr) { if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized", PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu) is not initialized",
ipt_name); input.first, i);
int tmp = static_cast<int>(t->type()); int tmp = static_cast<int>(t->type());
PADDLE_ENFORCE( PADDLE_ENFORCE(
tmp == data_type || data_type == -1, tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
Type(), last_input_name, data_type, ipt_name, tmp); Type(), data_type, tmp);
data_type = tmp; data_type = tmp;
last_input_name = ipt_name;
} }
} }
} }
......
...@@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; ...@@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@";
/// e.g. Variable "x@GRAD" is the gradient of variable "x". /// e.g. Variable "x@GRAD" is the gradient of variable "x".
constexpr char kGradVarSuffix[] = "@GRAD"; constexpr char kGradVarSuffix[] = "@GRAD";
constexpr size_t kGradVarSuffixSize = 5U;
/// Variables with this suffix are supposed to be filled up with zeros. /// Variables with this suffix are supposed to be filled up with zeros.
constexpr char kZeroVarSuffix[] = "@ZERO"; constexpr char kZeroVarSuffix[] = "@ZERO";
...@@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; ...@@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority; extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
inline std::string GradVarName(const std::string& var_name) { inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix; std::string result;
result.reserve(var_name.size() + kGradVarSuffixSize);
result += var_name;
result += kGradVarSuffix;
return result;
} }
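A quick illustration of the helper above (a sketch, not part of the diff): the reserve call sizes the buffer once, so the two appends never reallocate.
// Sketch: kGradVarSuffix is "@GRAD" and kGradVarSuffixSize is 5U, so
// result.reserve(var_name.size() + 5) performs the single heap allocation
// and the two += calls write into that buffer.
// std::string g = GradVarName("x");  // g == "x@GRAD"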
proto::VarType::Type GetDataTypeOfVar(const Variable* var); proto::VarType::Type GetDataTypeOfVar(const Variable* var);
...@@ -75,6 +81,10 @@ class RuntimeContext { ...@@ -75,6 +81,10 @@ class RuntimeContext {
RuntimeContext(const VariableNameMap& innames, RuntimeContext(const VariableNameMap& innames,
const VariableNameMap& outnames, const Scope& scope); const VariableNameMap& outnames, const Scope& scope);
RuntimeContext(const VariableValueMap& invars,
const VariableValueMap& outvars)
: inputs(invars), outputs(outvars) {}
VariableValueMap inputs; VariableValueMap inputs;
VariableValueMap outputs; VariableValueMap outputs;
}; };
...@@ -110,8 +120,8 @@ class OperatorBase { ...@@ -110,8 +120,8 @@ class OperatorBase {
bool HasAttr(const std::string& name) const { return attrs_.count(name); } bool HasAttr(const std::string& name) const { return attrs_.count(name); }
template <typename T> template <typename T>
inline const T& Attr(const std::string& name) const { inline const T& Attr(const std::string& name) const {
PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(),
name); "%s should be in AttributeMap", name);
return boost::get<T>(attrs_.at(name)); return boost::get<T>(attrs_.at(name));
} }
const AttributeMap& Attrs() const { return attrs_; } const AttributeMap& Attrs() const { return attrs_; }
...@@ -441,8 +451,9 @@ class OperatorWithKernel : public OperatorBase { ...@@ -441,8 +451,9 @@ class OperatorWithKernel : public OperatorBase {
void RuntimeInferShape(const Scope& scope, const platform::Place& place, void RuntimeInferShape(const Scope& scope, const platform::Place& place,
const RuntimeContext& ctx) const override; const RuntimeContext& ctx) const override;
protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
protected:
virtual OpKernelType GetKernelTypeForVar( virtual OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor, const std::string& var_name, const Tensor& tensor,
const OpKernelType& expected_kernel_type) const; const OpKernelType& expected_kernel_type) const;
......
...@@ -320,6 +320,7 @@ void ParallelExecutor::BCastParamsToDevices( ...@@ -320,6 +320,7 @@ void ParallelExecutor::BCastParamsToDevices(
if (paddle::platform::is_gpu_place(main_tensor.place())) { if (paddle::platform::is_gpu_place(main_tensor.place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::vector<void *> buffers; std::vector<void *> buffers;
buffers.reserve(member_->places_.size());
size_t numel = main_tensor.numel(); size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
...@@ -353,9 +354,7 @@ void ParallelExecutor::BCastParamsToDevices( ...@@ -353,9 +354,7 @@ void ParallelExecutor::BCastParamsToDevices(
#endif #endif
} else { } else {
platform::CPUPlace cpu; platform::CPUPlace cpu;
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 1; i < member_->places_.size(); ++i) {
if (i == 0) continue;
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>(); auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
......
...@@ -16,7 +16,9 @@ limitations under the License. */ ...@@ -16,7 +16,9 @@ limitations under the License. */
#if !defined(_WIN32) #if !defined(_WIN32)
#include <pthread.h> #include <pthread.h>
#endif // !_WIN32 #else
#include <mutex> // NOLINT
#endif // !_WIN32
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -29,17 +31,17 @@ struct RWLock { ...@@ -29,17 +31,17 @@ struct RWLock {
~RWLock() { pthread_rwlock_destroy(&lock_); } ~RWLock() { pthread_rwlock_destroy(&lock_); }
void RDLock() { inline void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed"); "acquire read lock failed");
} }
void WRLock() { inline void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed"); "acquire write lock failed");
} }
void UNLock() { inline void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
} }
...@@ -51,81 +53,46 @@ struct RWLock { ...@@ -51,81 +53,46 @@ struct RWLock {
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
// On Windows, rw_lock seems like a hack. Use an empty object and do nothing. // On Windows, rw_lock seems like a hack. Use an empty object and do nothing.
struct RWLock { struct RWLock {
void RDLock() {} // FIXME(minqiyang): use a mutex here as a fake lock
void WRLock() {} inline void RDLock() { mutex_.lock(); }
void UNLock() {}
inline void WRLock() { mutex_.lock(); }
inline void UNLock() { mutex_.unlock(); }
private:
std::mutex mutex_;
}; };
#endif #endif
class RWLockGuard { class AutoWRLock {
public: public:
enum Status { kUnLock, kWRLock, kRDLock }; explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
RWLockGuard(RWLock* rw_lock, Status init_status)
: lock_(rw_lock), status_(Status::kUnLock) {
switch (init_status) {
case Status::kRDLock: {
RDLock();
break;
}
case Status::kWRLock: {
WRLock();
break;
}
case Status::kUnLock: {
break;
}
}
}
void WRLock() { ~AutoWRLock() { UnLock(); }
switch (status_) {
case Status::kUnLock: {
lock_->WRLock();
status_ = Status::kWRLock;
break;
}
case Status::kWRLock: {
break;
}
case Status::kRDLock: {
PADDLE_THROW(
"Please unlock read lock first before invoking write lock.");
break;
}
}
}
void RDLock() { private:
switch (status_) { inline void Lock() { lock_->WRLock(); }
case Status::kUnLock: {
lock_->RDLock();
status_ = Status::kRDLock;
break;
}
case Status::kRDLock: {
break;
}
case Status::kWRLock: {
PADDLE_THROW(
"Please unlock write lock first before invoking read lock.");
break;
}
}
}
void UnLock() { inline void UnLock() { lock_->UNLock(); }
if (status_ != Status::kUnLock) {
lock_->UNLock(); private:
status_ = Status::kUnLock; RWLock* lock_;
} };
}
class AutoRDLock {
public:
explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
~AutoRDLock() { UnLock(); }
private:
inline void Lock() { lock_->RDLock(); }
~RWLockGuard() { UnLock(); } inline void UnLock() { lock_->UNLock(); }
private: private:
RWLock* lock_; RWLock* lock_;
Status status_;
}; };
} // namespace framework } // namespace framework
......
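A hedged usage sketch of the RAII guards defined above (illustrative only): the constructor acquires the lock and the destructor releases it, so every exit path unlocks.
// RWLock lock;
// {
//   AutoRDLock reader(&lock);  // constructor calls lock.RDLock()
//   ... read shared state ...
// }                            // destructor calls lock.UNLock()
// {
//   AutoWRLock writer(&lock);  // constructor calls lock.WRLock()
//   ... mutate shared state ...
// }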
...@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false, ...@@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false,
// the mutex will cause a serious performance issue. // the mutex will cause a serious performance issue.
// So the mutex is disabled when `ON_INFER`. // So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE #ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD #define SCOPE_KIDS_READER_LOCK
#define SCOPE_KIDS_WRITER_LOCK
#define SCOPE_VARS_READER_LOCK
#define SCOPE_VARS_WRITER_LOCK
#else #else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_); #define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
#endif #endif
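To see what the macros above do in practice, a sketch (assuming PADDLE_ON_INFERENCE is undefined) of how a method such as Scope::FindVar effectively expands:
// Variable* Scope::FindVar(const std::string& name) const {
//   AutoRDLock auto_lock(&vars_lock_);  // SCOPE_VARS_READER_LOCK
//   return FindVarInternal(name);
// }
// Under PADDLE_ON_INFERENCE each macro expands to nothing, so no lock is taken.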
namespace paddle { namespace paddle {
...@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } ...@@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
Scope::~Scope() { DropKids(); } Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
SCOPE_LOCK_GUARD Scope* child = new Scope(this);
kids_.push_back(new Scope(this)); {
return *kids_.back(); SCOPE_KIDS_WRITER_LOCK
kids_.push_back(child);
}
return *child;
} }
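// Note: the nested brace scope above holds the writer lock only around the
// kids_.push_back call; the child Scope is allocated and returned outside
// the critical section.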
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
return VarInternal(name); return VarInternal(name);
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
SCOPE_LOCK_GUARD
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
} }
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name); return VarInternal(new_name);
} }
Variable* Scope::FindVar(const std::string& name) const { Variable* Scope::FindVar(const std::string& name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindVarInternal(name); return FindVarInternal(name);
} }
Variable* Scope::FindLocalVar(const std::string& name) const { Variable* Scope::FindLocalVar(const std::string& name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindVarLocally(name); return FindVarLocally(name);
} }
const Scope* Scope::FindScope(const Variable* var) const { const Scope* Scope::FindScope(const Variable* var) const {
SCOPE_LOCK_GUARD SCOPE_VARS_READER_LOCK
return FindScopeInternal(var); return FindScopeInternal(var);
} }
void Scope::DropKids() { void Scope::DropKids() {
SCOPE_LOCK_GUARD SCOPE_KIDS_WRITER_LOCK
for (Scope* s : kids_) delete s; for (Scope* s : kids_) delete s;
kids_.clear(); kids_.clear();
} }
bool Scope::HasKid(const Scope* scope) const { bool Scope::HasKid(const Scope* scope) const {
SCOPE_LOCK_GUARD SCOPE_KIDS_READER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end(); return it != this->kids_.end();
} }
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
SCOPE_LOCK_GUARD
std::vector<std::string> known_vars; std::vector<std::string> known_vars;
known_vars.reserve(this->vars_.size()); {
for (auto& p : vars_) { SCOPE_VARS_READER_LOCK
known_vars.emplace_back(p.first); known_vars.reserve(this->vars_.size());
for (auto& p : vars_) {
known_vars.emplace_back(p.first);
}
} }
return known_vars; return known_vars;
} }
void Scope::DeleteScope(Scope* scope) const { void Scope::DeleteScope(Scope* scope) const {
SCOPE_LOCK_GUARD SCOPE_KIDS_WRITER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
this, scope); this, scope);
...@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const { ...@@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const {
} }
void Scope::EraseVars(const std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
SCOPE_LOCK_GUARD
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
SCOPE_VARS_WRITER_LOCK
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
it = vars_.erase(it); it = vars_.erase(it);
...@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) { ...@@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const { const std::string& new_name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
} }
std::string Scope::Rename(const std::string& origin_name) const { std::string Scope::Rename(const std::string& origin_name) const {
SCOPE_LOCK_GUARD SCOPE_VARS_WRITER_LOCK
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
return new_name; return new_name;
...@@ -165,11 +176,9 @@ std::string Scope::Rename(const std::string& origin_name) const { ...@@ -165,11 +176,9 @@ std::string Scope::Rename(const std::string& origin_name) const {
Variable* Scope::VarInternal(const std::string& name) { Variable* Scope::VarInternal(const std::string& name) {
auto* v = FindVarLocally(name); auto* v = FindVarLocally(name);
if (v != nullptr) return v; if (v != nullptr) return v;
v = new Variable(); v = new Variable();
vars_[name].reset(v); vars_.emplace(name, std::unique_ptr<Variable>(v));
VLOG(3) << "Create variable " << name; VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
return v; return v;
} }
......
...@@ -14,12 +14,18 @@ limitations under the License. */ ...@@ -14,12 +14,18 @@ limitations under the License. */
#pragma once #pragma once
extern "C" {
#include <xxhash.h>
}
#include <list> #include <list>
#include <mutex> // NOLINT #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
...@@ -95,7 +101,14 @@ class Scope { ...@@ -95,7 +101,14 @@ class Scope {
std::string Rename(const std::string& origin_name) const; std::string Rename(const std::string& origin_name) const;
protected: protected:
mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_; struct KeyHasher {
std::size_t operator()(const std::string& key) const {
return XXH32(key.c_str(), key.size(), 1);
}
};
mutable std::unordered_map<std::string, std::unique_ptr<Variable>, KeyHasher>
vars_;
private: private:
// Call Scope::NewScope for a sub-scope. // Call Scope::NewScope for a sub-scope.
...@@ -124,7 +137,8 @@ class Scope { ...@@ -124,7 +137,8 @@ class Scope {
DISABLE_COPY_AND_ASSIGN(Scope); DISABLE_COPY_AND_ASSIGN(Scope);
private: private:
mutable std::mutex mutex_; mutable RWLock kids_lock_;
mutable RWLock vars_lock_;
}; };
// Generate some debug string about the inheritance structure of scope, quite // Generate some debug string about the inheritance structure of scope, quite
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/threadpool.h"
namespace paddle {
namespace framework {
ScopePool &ScopePool::Instance() { // NOLINT
static ScopePool pool;
return pool;
}
void ScopePool::DeleteScope(Scope *scope) { delete scope; }
void ScopePool::Insert(std::unique_ptr<Scope> &&s) {
std::lock_guard<std::mutex> guard(mtx_);
scopes_.insert(s.release());
}
void ScopePool::Remove(Scope *s) {
size_t has_scope;
{
std::lock_guard<std::mutex> guard(mtx_);
has_scope = scopes_.erase(s);
}
PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope");
DeleteScope(s);
}
ScopePool::~ScopePool() { Clear(); }
void ScopePool::Clear() {
std::lock_guard<std::mutex> guard(mtx_);
for (auto *s : scopes_) {
DeleteScope(s);
}
scopes_.clear();
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include <unordered_set>
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class ScopePool {
public:
static ScopePool &Instance(); // NOLINT
void Insert(std::unique_ptr<Scope> &&s);
void Remove(Scope *s);
void Clear();
~ScopePool();
private:
ScopePool() = default;
static void DeleteScope(Scope *scope);
std::unordered_set<Scope *> scopes_;
std::mutex mtx_;
};
} // namespace framework
} // namespace paddle
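A hedged usage sketch of the pool above (the variable names are illustrative): ownership of an inserted scope passes to the pool until Remove or Clear deletes it.
// std::unique_ptr<Scope> s(new Scope);
// Scope* raw = s.get();
// ScopePool::Instance().Insert(std::move(s));  // pool now owns the scope
// ScopePool::Instance().Remove(raw);           // erases it from the set and deletes it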
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <type_traits>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace framework {
namespace detail {
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollFillConstant {
template <typename T>
HOSTDEVICE inline static void Run(T *data, T val) {
data[kStart] = val;
UnrollFillConstant<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(data, val);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollFillConstant<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(T *data, T val) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollAssign {
template <typename Tin, typename Tout>
HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {
d2[kStart] = static_cast<Tout>(d1[kStart]);
UnrollAssign<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollAssign<kStart, kEnd, true> {
template <typename Tin, typename Tout>
HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {}
};
template <typename T, size_t kStart, size_t kEnd, bool kStop>
struct UnrollVarArgsAssignImpl {
template <typename... Args>
HOSTDEVICE inline static void Run(T *d, T val, Args... args) {
static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument");
d[kStart] = val;
UnrollVarArgsAssignImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd>::Run(
d, args...);
}
};
template <typename T, size_t kStart, size_t kEnd>
struct UnrollVarArgsAssignImpl<T, kStart, kEnd, true> {
HOSTDEVICE inline static void Run(T *d) {}
};
template <typename T>
struct UnrollVarArgsAssign {
template <typename... Args>
HOSTDEVICE inline static void Run(T *d, Args... args) {
UnrollVarArgsAssignImpl<T, 0, sizeof...(Args), sizeof...(Args) == 0>::Run(
d, args...);
}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollCompare {
template <typename T>
HOSTDEVICE inline static bool Run(const T *d1, const T *d2) {
return d1[kStart] == d2[kStart] &&
UnrollCompare<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollCompare<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) {
return true;
}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollAdd {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
d3[kStart] = d1[kStart] + d2[kStart];
UnrollAdd<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollAdd<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollMul {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {
d3[kStart] = d1[kStart] * d2[kStart];
UnrollMul<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2, d3);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollMul<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {}
};
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollProduct {
template <typename T>
HOSTDEVICE inline static T Run(const T *d) {
return d[kStart] *
UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
}
template <typename T>
HOSTDEVICE inline static T Run(const T *d1, const T *d2) {
return d1[kStart] * d2[kStart] +
UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d1, d2);
}
};
template <size_t kStart, size_t kEnd>
struct UnrollProduct<kStart, kEnd, true> {
template <typename T>
HOSTDEVICE inline constexpr static T Run(const T *d) {
return 1;
}
template <typename T>
HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) {
return 0;
}
};
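// Note on the two base cases above: the single-array Run() terminates the
// recursion with the multiplicative identity (1), while the two-array Run()
// terminates the dot-product accumulation with the additive identity (0).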
} // namespace detail
template <size_t N>
using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>;
template <size_t N>
using UnrollAssign = detail::UnrollAssign<0, N, N == 0>;
template <typename T>
using UnrollVarArgsAssign = detail::UnrollVarArgsAssign<T>;
template <size_t N>
using UnrollCompare = detail::UnrollCompare<0, N, N == 0>;
template <size_t N>
using UnrollAdd = detail::UnrollAdd<0, N, N == 0>;
template <size_t N>
using UnrollMul = detail::UnrollMul<0, N, N == 0>;
template <size_t N>
using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/unroll_array_ops.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <array>
#include <cstdint>
namespace paddle {
namespace framework {
template <typename T>
bool CheckEquality(const T* p, size_t n, T val) {
return std::all_of(p, p + n, [val](const T& v) { return v == val; });
}
template <int D1, int D2>
bool FillConstantTestMain() {
static_assert(D1 >= D2, "");
std::array<int, D1> arr;
arr.fill(0);
UnrollFillConstant<D2>::Run(arr.data(), 1);
return CheckEquality(arr.data(), D2, 1) &&
CheckEquality(arr.data() + D2, arr.size() - D2, 0);
}
TEST(unroll_ops, fill_constant) {
EXPECT_TRUE((FillConstantTestMain<9, 0>()));
EXPECT_TRUE((FillConstantTestMain<9, 1>()));
EXPECT_TRUE((FillConstantTestMain<9, 4>()));
EXPECT_TRUE((FillConstantTestMain<9, 9>()));
}
TEST(unroll_ops, assign) {
const int a[] = {1, 2, 3, 4, 5};
int b[] = {0, 0, 0, 0, 0};
UnrollAssign<3>::Run(a, b);
EXPECT_EQ(b[0], 1);
EXPECT_EQ(b[1], 2);
EXPECT_EQ(b[2], 3);
EXPECT_EQ(b[3], 0);
EXPECT_EQ(b[4], 0);
}
TEST(unroll_ops, var_args_assign) {
int a[] = {0, 0, 0};
UnrollVarArgsAssign<int>::Run(a, 1, 2);
EXPECT_EQ(a[0], 1);
EXPECT_EQ(a[1], 2);
EXPECT_EQ(a[2], 0);
}
TEST(unroll_ops, compare) {
int a[] = {1, 2, 3};
int b[] = {1, 2, 4};
EXPECT_TRUE(UnrollCompare<2>::Run(a, b));
EXPECT_FALSE(UnrollCompare<3>::Run(a, b));
b[0] = -1;
EXPECT_TRUE(UnrollCompare<0>::Run(a, b));
EXPECT_FALSE(UnrollCompare<1>::Run(a, b));
}
TEST(unroll_ops, add) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
int c[] = {0, 0, 0};
UnrollAdd<2>::Run(a, b, c);
EXPECT_EQ(a[0] + b[0], c[0]);
EXPECT_EQ(a[1] + b[1], c[1]);
EXPECT_EQ(c[2], 0);
}
TEST(unroll_ops, mul) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
int c[] = {0, 0, 0};
UnrollMul<2>::Run(a, b, c);
EXPECT_EQ(a[0] * b[0], c[0]);
EXPECT_EQ(a[1] * b[1], c[1]);
EXPECT_EQ(c[2], 0);
}
TEST(unroll_ops, product) {
int a[] = {2, 3, 4};
int b[] = {5, 10, 102};
EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]);
EXPECT_EQ(UnrollProduct<3>::Run(a, b),
a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
}
} // namespace framework
} // namespace paddle
...@@ -19,52 +19,50 @@ limitations under the License. */ ...@@ -19,52 +19,50 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename T> template <typename T>
inline bool IsType(const std::type_index& type_index) { inline bool IsType(const std::type_index& type) {
return type_index == std::type_index(typeid(T)); return type == typeid(T);
} }
inline proto::VarType::Type ToVarType(std::type_index type) { inline proto::VarType::Type ToVarType(int type) {
if (IsType<LoDTensor>(type)) { switch (type) {
return proto::VarType_Type_LOD_TENSOR; case proto::VarType::LOD_TENSOR:
} else if (IsType<LoDRankTable>(type)) { case proto::VarType::SELECTED_ROWS:
return proto::VarType_Type_LOD_RANK_TABLE; case proto::VarType::LOD_RANK_TABLE:
} else if (IsType<LoDTensorArray>(type)) { case proto::VarType::LOD_TENSOR_ARRAY:
return proto::VarType_Type_LOD_TENSOR_ARRAY; case proto::VarType::READER:
} else if (IsType<SelectedRows>(type)) { return static_cast<proto::VarType::Type>(type);
return proto::VarType_Type_SELECTED_ROWS; default:
} else if (IsType<ReaderHolder>(type)) { PADDLE_THROW("ToVarType:Unsupported type %d", type);
return proto::VarType_Type_READER;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
} }
} }
template <typename Visitor> template <typename Visitor>
inline void VisitVarType(const framework::Variable& var, Visitor visitor) { inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
switch (ToVarType(var.Type())) { switch (var.Type()) {
case proto::VarType_Type_LOD_TENSOR: case proto::VarType::LOD_TENSOR:
visitor(var.Get<LoDTensor>()); visitor(var.Get<LoDTensor>());
return; return;
case proto::VarType_Type_LOD_RANK_TABLE: case proto::VarType::LOD_RANK_TABLE:
visitor(var.Get<LoDRankTable>()); visitor(var.Get<LoDRankTable>());
return; return;
case proto::VarType_Type_LOD_TENSOR_ARRAY: case proto::VarType::LOD_TENSOR_ARRAY:
visitor(var.Get<LoDTensorArray>()); visitor(var.Get<LoDTensorArray>());
return; return;
case proto::VarType_Type_SELECTED_ROWS: case proto::VarType::SELECTED_ROWS:
visitor(var.Get<SelectedRows>()); visitor(var.Get<SelectedRows>());
return; return;
case proto::VarType_Type_READER: case proto::VarType::READER:
visitor(var.Get<ReaderHolder>()); visitor(var.Get<ReaderHolder>());
return; return;
default: default:
PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
} }
} }
......
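A hedged sketch of a visitor for the dispatcher above (the functor below is illustrative, not from the source): any callable with an overload per visited type works.
// struct NumelPrinter {
//   void operator()(const LoDTensor& t) { VLOG(3) << t.numel(); }
//   void operator()(const SelectedRows& sr) { VLOG(3) << sr.value().numel(); }
//   template <typename T>
//   void operator()(const T&) { /* other registered types */ }
// };
// VisitVarType(var, NumelPrinter());  // dispatches on var.Type()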
...@@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) { ...@@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) {
op->InferVarType(prog.MutableBlock(0)); op->InferVarType(prog.MutableBlock(0));
ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, ASSERT_EQ(proto::VarType::LOD_TENSOR,
prog.MutableBlock(0)->Var("test2_out")->GetType()); prog.MutableBlock(0)->Var("test2_out")->GetType());
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/macros.h"
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#include <cudnn.h>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
namespace paddle {
namespace framework {
// Besides registering variable type ids, it is helpful to register a
// var_id -> std::type_index map (for example, to get a type name from its id)
namespace detail {
template <int kStart, int kEnd, bool kStop>
struct VarIdToTypeIndexMapInitializerImpl {
template <typename MapType1, typename MapType2>
static void Init(MapType1 *id_to_type, MapType2 *type_to_id) {
using Type =
typename std::tuple_element<kStart, VarTypeRegistry::ArgTuple>::type;
static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
constexpr int kId = VarTypeTrait<Type>::kId;
auto type = std::type_index(typeid(Type));
PADDLE_ENFORCE(id_to_type->count(kId) == 0,
"Registered duplicate type id %d for type %s", kId,
type.name());
PADDLE_ENFORCE(type_to_id->count(type) == 0,
"Registered duplicate type_index %s for id %d", type.name(),
kId);
id_to_type->emplace(kId, type);
type_to_id->emplace(type, kId);
VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
kStart + 1 == kEnd>::Init(id_to_type,
type_to_id);
}
};
template <int kStart, int kEnd>
struct VarIdToTypeIndexMapInitializerImpl<kStart, kEnd, true> {
template <typename MapType1, typename MapType2>
static void Init(MapType1 *, MapType2 *) {}
};
// VarIdToTypeIndexMapInitializer is designed to initialize var_id ->
// std::type_index map and std::type_index -> var_id map
using VarIdToTypeIndexMapInitializer =
VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum,
VarTypeRegistry::kRegisteredTypeNum ==
0>;
struct VarIdToTypeIndexMapHolder {
DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder);
public:
static const std::type_index &ToTypeIndex(int var_id) {
auto it = Instance().id_to_type_map_.find(var_id);
PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
"VarId %d is not registered.", var_id);
return it->second;
}
static int ToTypeId(const std::type_index &type) {
auto it = Instance().type_to_id_map_.find(type);
PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
"VarType %s is not registered.", type.name());
return it->second;
}
private:
VarIdToTypeIndexMapHolder() {
VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_);
}
static const VarIdToTypeIndexMapHolder &Instance() {
static const VarIdToTypeIndexMapHolder instance;
return instance;
}
std::unordered_map<int, std::type_index> id_to_type_map_;
std::unordered_map<std::type_index, int> type_to_id_map_;
};
} // namespace detail
const std::type_index &ToTypeIndex(int var_id) {
return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
}
const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); }
int ToTypeId(const std::type_index &type) {
return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
}
} // namespace framework
} // namespace paddle
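The three helpers above round-trip through the two maps; a minimal sketch (mirroring what the unit test later in this diff checks):
// int id = ToTypeId(std::type_index(typeid(LoDTensor)));
// const char* name = ToTypeName(id);  // human-readable type name
// bool ok = (ToTypeIndex(id) == std::type_index(typeid(LoDTensor)));  // true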
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <tuple>
#include <typeindex>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#ifndef _WIN32
#include <nccl.h>
#endif
#endif
// Users should add forward declarations here
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
class Communicator;
#endif
#endif
} // namespace platform
namespace framework {
class Tensor;
class LoDTensor;
class SelectedRows;
class LoDRankTable;
class ReaderHolder;
class Scope;
} // namespace framework
namespace operators {
template <typename T>
class AlgorithmsCache;
class CudnnRNNCache;
namespace reader {
class LoDTensorBlockingQueueHolder;
} // namespace reader
} // namespace operators
} // namespace paddle
namespace paddle {
namespace framework {
const char *ToTypeName(int var_id);
const std::type_index &ToTypeIndex(int var_id);
int ToTypeId(const std::type_index &type);
namespace detail {
template <bool kStop, int kStart, int kEnd, typename T1, typename T2,
typename... Args>
struct TypePosFinderImpl {
static constexpr int kPos =
std::is_same<T1, T2>::value
? kStart
: TypePosFinderImpl<kStart + 2 == kEnd, kStart + 1, kEnd, T1,
Args...>::kPos;
};
template <int kStart, int kEnd, typename T1, typename T2>
struct TypePosFinderImpl<true, kStart, kEnd, T1, T2> {
static constexpr int kPos = std::is_same<T1, T2>::value ? kStart : -1;
};
// TypePosFinder helps to find the position of T inside Args...
// If T is not inside Args..., kPos will be -1
template <typename T, typename... Args>
struct TypePosFinder {
static constexpr int kPos =
TypePosFinderImpl<sizeof...(Args) == 1, 0, sizeof...(Args), T,
Args...>::kPos;
};
template <typename... Args>
struct VarTypeRegistryImpl {
static constexpr size_t kRegisteredTypeNum = sizeof...(Args);
using ArgTuple = std::tuple<Args...>;
// TypePos() returns the position of T inside Args...
// If T is not inside Args..., it returns -1
template <typename T>
static constexpr int TypePos() {
return TypePosFinder<T, Args...>::kPos;
}
// IsRegistered() returns whether T is registered inside RegistryImpl
template <typename T>
static constexpr bool IsRegistered() {
return TypePos<T>() >= 0;
}
};
} // namespace detail
#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \
template <> \
struct VarTypeTrait<type> { \
static_assert(VarTypeRegistry::IsRegistered<type>(), \
"Must be registered type"); \
using Type = type; \
static constexpr int kId = static_cast<int>(proto_id); \
}
/**
 * The following code is designed to register variable types.
* Only registered types can be stored in Variable.
* This registry mechanism is designed to speed up Variable.
*
* Caution: If you want to add more var types, please consider carefully
* whether you really need to add it.
*/
// Users should add other variable types below.
// Paddle will generate a unique Id for each registered variable type.
using VarTypeRegistry = detail::VarTypeRegistryImpl<
Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
std::map<size_t, Tensor>, operators::reader::LoDTensorBlockingQueueHolder,
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
ncclUniqueId, platform::Communicator,
#endif
operators::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>,
operators::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>,
operators::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>,
operators::CudnnRNNCache,
#endif
int, float>;
template <typename T>
struct VarTypeTrait {
static_assert(VarTypeRegistry::IsRegistered<T>(), "Must be registered type");
using Type = T;
/**
* Unique VarType Id generation.
*
   * The auto-generated id must not collide with any protobuf id defined in
   * framework.proto, so ids are generated by adding the type pos to the
   * maximum protobuf id (i.e., proto::VarType::TUPLE).
   *
   * However, more protobuf ids may be needed in the future.
   * To avoid changing this auto id generation algorithm frequently, ids are
   * actually generated by adding the type pos to twice the maximum protobuf
   * id (i.e., proto::VarType::TUPLE).
*/
static constexpr int kId = VarTypeRegistry::TypePos<T>() +
static_cast<int>(proto::VarType::TUPLE) * 2;
};
// Users should set some variable type ids to match what is defined in
// framework.proto below
REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR);
REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS);
REG_PROTO_VAR_TYPE_TRAIT(std::vector<Scope *>, proto::VarType::STEP_SCOPES);
REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
/** End of variable type registration */
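As a sanity sketch of the id scheme (not part of the change): Tensor is the first registry entry and has no REG_PROTO_VAR_TYPE_TRAIT override, so its auto-generated id is exactly twice proto::VarType::TUPLE, which keeps it clear of every protobuf id.
// static_assert(VarTypeTrait<Tensor>::kId ==
//                   2 * static_cast<int>(proto::VarType::TUPLE),
//               "auto id == type pos (0) + 2 * max protobuf id");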
template <typename T>
inline constexpr bool IsRegisteredVarType() {
return VarTypeRegistry::IsRegistered<T>();
}
#undef REG_PROTO_VAR_TYPE_TRAIT
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
namespace paddle {
namespace framework {
template <int kPos, int kEnd, bool kStop>
struct TypeIndexChecker {
template <typename SetType1, typename SetType2>
static void Check(SetType1 *var_id_set, SetType2 *type_index_set) {
using Type =
typename std::tuple_element<kPos, VarTypeRegistry::ArgTuple>::type;
static_assert(std::is_same<typename VarTypeTrait<Type>::Type, Type>::value,
"Type must be the same");
constexpr auto kId = VarTypeTrait<Type>::kId;
std::type_index actual_type(typeid(Type));
EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
EXPECT_EQ(ToTypeIndex(kId), actual_type);
EXPECT_EQ(ToTypeId(actual_type), kId);
EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type);
EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId);
EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT
EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT
var_id_set->insert(kId);
type_index_set->insert(std::type_index(typeid(Type)));
TypeIndexChecker<kPos + 1, kEnd, kPos + 1 == kEnd>::Check(var_id_set,
type_index_set);
}
};
template <int kPos, int kEnd>
struct TypeIndexChecker<kPos, kEnd, true> {
template <typename SetType1, typename SetType2>
static void Check(SetType1 *, SetType2 *) {}
};
TEST(var_type_traits, check_no_duplicate_registry) {
constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum;
std::unordered_set<int> var_id_set;
std::unordered_set<std::type_index> type_index_set;
TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check(
&var_id_set, &type_index_set);
}
template <typename T>
bool CheckVarId(int proto_id) {
static_assert(std::is_same<typename VarTypeTrait<T>::Type, T>::value,
"Type must be the same");
return VarTypeTrait<T>::kId == proto_id;
}
TEST(var_type_traits, check_proto_type_id) {
ASSERT_TRUE(CheckVarId<LoDTensor>(proto::VarType::LOD_TENSOR));
ASSERT_TRUE(CheckVarId<SelectedRows>(proto::VarType::SELECTED_ROWS));
ASSERT_TRUE(CheckVarId<std::vector<Scope *>>(proto::VarType::STEP_SCOPES));
ASSERT_TRUE(CheckVarId<LoDRankTable>(proto::VarType::LOD_RANK_TABLE));
ASSERT_TRUE(CheckVarId<LoDTensorArray>(proto::VarType::LOD_TENSOR_ARRAY));
ASSERT_TRUE(CheckVarId<platform::PlaceList>(proto::VarType::PLACE_LIST));
ASSERT_TRUE(CheckVarId<ReaderHolder>(proto::VarType::READER));
ASSERT_TRUE(CheckVarId<int>(proto::VarType::INT32));
ASSERT_TRUE(CheckVarId<float>(proto::VarType::FP32));
ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR);
ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS);
ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES);
ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE);
ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY,
proto::VarType::LOD_TENSOR_ARRAY);
ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST);
ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER);
ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH);
ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST);
ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW);
ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE);
ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32);
ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32);
}
TEST(var_type_traits, test_registry) {
using Registry = detail::VarTypeRegistryImpl<int8_t, int32_t, size_t, double>;
ASSERT_TRUE(Registry::TypePos<int8_t>() == 0);
ASSERT_TRUE(Registry::TypePos<int32_t>() == 1);
ASSERT_TRUE(Registry::TypePos<size_t>() == 2);
ASSERT_TRUE(Registry::TypePos<double>() == 3);
ASSERT_TRUE(Registry::TypePos<float>() == -1);
}
} // namespace framework
} // namespace paddle
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include <typeindex> #include <typeindex>
#include <typeinfo> #include <typeinfo>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/framework/var_type_traits.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -27,10 +27,14 @@ class Variable { ...@@ -27,10 +27,14 @@ class Variable {
public: public:
template <typename T> template <typename T>
const T& Get() const { const T& Get() const {
static_assert(
IsRegisteredVarType<T>(),
"Not registered type. Please register T inside var_type_traits.h");
PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold something"); PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold something");
PADDLE_ENFORCE(IsType<T>(), PADDLE_ENFORCE(holder_->Type() == VarTypeTrait<T>::kId,
"Variable must be type %s, the holding type is %s", "Variable must be type %s, the holding type is %s",
typeid(T).name(), holder_->Type().name()); ToTypeName(VarTypeTrait<T>::kId),
ToTypeName(holder_->Type()));
return *static_cast<const T*>(holder_->Ptr()); return *static_cast<const T*>(holder_->Ptr());
} }
...@@ -39,61 +43,61 @@ class Variable { ...@@ -39,61 +43,61 @@ class Variable {
template <typename T> template <typename T>
T* GetMutable() { T* GetMutable() {
if (!holder_) { if (!holder_) {
holder_.reset(new PlaceholderImpl<T>(new T())); holder_.reset(new PlaceholderImpl<T>());
} else { } else {
PADDLE_ENFORCE(IsType<T>(), PADDLE_ENFORCE(holder_->Type() == VarTypeTrait<T>::kId,
"Variable must be type %s, the holding type is %s", "Variable must be type %s, the holding type is %s",
typeid(T).name(), holder_->Type().name()); ToTypeName(VarTypeTrait<T>::kId),
ToTypeName(holder_->Type()));
} }
return static_cast<T*>(holder_->Ptr()); return static_cast<T*>(holder_->Ptr());
} }
template <typename T> template <typename T>
bool IsType() const { bool IsType() const {
return holder_ != nullptr && return holder_ && holder_->Type() == VarTypeTrait<T>::kId;
std::type_index(typeid(T)) == std::type_index(holder_->Type());
} }
void Clear() { holder_.reset(); } void Clear() { holder_.reset(); }
std::type_index Type() const { int Type() const {
PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
return holder_->Type(); return holder_->Type();
} }
private: private:
struct Placeholder { struct Placeholder {
virtual ~Placeholder() {} virtual ~Placeholder() = default;
virtual const std::type_info& Type() const = 0;
virtual void* Ptr() const = 0; inline int Type() const { return type_; }
inline const void* Ptr() const { return ptr_; }
inline void* Ptr() { return ptr_; }
protected:
inline void Init(void* p, int type) {
ptr_ = p;
type_ = type;
}
void* ptr_;
int type_;
}; };
// Placeholder hides type T, so it doesn't appear as a template // Placeholder hides type T, so it doesn't appear as a template
// parameter of Variable. // parameter of Variable.
template <typename T> template <typename T>
struct PlaceholderImpl : public Placeholder { struct PlaceholderImpl : public Placeholder {
explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} static_assert(
IsRegisteredVarType<T>(),
virtual const std::type_info& Type() const { return type_; } "Not registered type. Please register T inside var_type_traits.h");
virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); } PlaceholderImpl() { this->Init(&obj_, VarTypeTrait<T>::kId); }
std::unique_ptr<T> ptr_; private:
const std::type_info& type_; T obj_;
}; };
std::unique_ptr<Placeholder> // points to a PlaceholderImpl object.
holder_; // points to a PlaceholderImpl object. std::unique_ptr<Placeholder> holder_;
// name_ is only meaningful with a Scope and accessible by it.
//
// NOTE: Please don't expose name_ by adding methods like
// Variable::Name or Scope::VarName! A variable could have a human
// readable name or an auto-generated scope-unique name. In the
// former case, the caller knows the name and doesn't need to access
// the name; in the latter case, the variable should be identified
// by its address but not the unreadable name.
friend class Scope;
const std::string* name_;
}; };
} // namespace framework } // namespace framework
......
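A hedged usage sketch of the Variable fast path above: type checks compare integer ids rather than std::type_index, and PlaceholderImpl stores T inline instead of behind a second heap allocation. Both int and float are registered types, so:
// Variable var;
// int* p = var.GetMutable<int>();   // creates PlaceholderImpl<int>, obj_ inline
// *p = 42;
// bool is_int = var.IsType<int>();  // integer compare against VarTypeTrait<int>::kId
// var.Get<float>();                 // would fail the PADDLE_ENFORCE: id mismatch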
...@@ -16,27 +16,28 @@ ...@@ -16,27 +16,28 @@
#include <string> #include <string>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
TEST(Variable, GetMutable) { namespace paddle {
using paddle::framework::Variable; namespace framework {
struct Tensor {
int content_;
};
TEST(Variable, GetMutable) {
std::unique_ptr<Variable> v(new Variable()); std::unique_ptr<Variable> v(new Variable());
Tensor* t = v->GetMutable<Tensor>(); auto* t = v->GetMutable<std::string>();
t->content_ = 1234; *t = "1234";
const Tensor& tt = v->Get<Tensor>(); const auto& tt = v->Get<std::string>();
EXPECT_EQ(1234, tt.content_); EXPECT_EQ("1234", tt);
try { try {
v->GetMutable<std::string>(); v->GetMutable<Tensor>();
} catch (std::exception& e) { } catch (std::exception& e) {
return; return;
} }
EXPECT_TRUE(false); EXPECT_TRUE(false);
} }
} // namespace framework
} // namespace paddle
(This diff has been collapsed.)
...@@ -14,17 +14,69 @@ ...@@ -14,17 +14,69 @@
#pragma once #pragma once
#include <map>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx)
: op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {}
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = op.GetExpectedKernelType(
framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
expected_kernel_key.library_type_ = framework::LibraryType::kPlain;
expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
}
const framework::OperatorBase& op;
const framework::RuntimeContext& ctx;
framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx;
};
class OpBase; class OpBase;
class VarBase { class VarBase {
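PreparedOp, added above, splits imperative kernel dispatch into a one-time Prepare() step — device-context lookup, kernel-map lookup, optional MKLDNN-to-plain fallback — and a later invocation of the cached kernel function. A hypothetical caller-side sketch (the RunOnPlace wrapper and its names are illustrative, not part of the diff):

// Illustrative only: how an imperative tracer might drive PreparedOp.
void RunOnPlace(const framework::OperatorWithKernel& op,
                const framework::RuntimeContext& ctx,
                const platform::Place& place) {
  // Resolve the device context and kernel exactly once.
  imperative::PreparedOp prepared =
      imperative::PreparedOp::Prepare(ctx, op, place);

  // Invoke the cached kernel; the ExecutionContext mirrors the one Prepare()
  // used to compute the expected kernel key.
  prepared.func(framework::ExecutionContext(prepared.op, framework::Scope(),
                                            *prepared.dev_ctx, prepared.ctx));
}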
@@ -33,18 +85,26 @@ class VarBase {
       : pre_op_(nullptr),
         pre_op_out_idx_(-1),
         var_desc_(nullptr),
-        var_(nullptr),
-        grads_(nullptr) {}
+        var_(new framework::Variable()),
+        grads_(new framework::Variable()) {}

-  virtual ~VarBase() {}
+  virtual ~VarBase() {
+    if (var_) {
+      delete var_;
+      var_ = nullptr;
+    }
+    if (grads_) {
+      delete grads_;
+      grads_ = nullptr;
+    }
+  }

-  void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
-
-  void RunBackward(framework::Scope* scope);
+  void RunBackward();

   framework::LoDTensor& Grad();

   OpBase* pre_op_;
+  std::string pre_op_out_name_;
   int pre_op_out_idx_;

   framework::VarDesc* var_desc_;
@@ -54,35 +114,24 @@ class VarBase {
 class OpBase {
  public:
-  OpBase()
-      : input_vars_(new std::vector<VarBase*>()),
-        output_vars_(new std::vector<VarBase*>()),
-        pre_ops_(new std::vector<OpBase*>()),
-        pre_ops_out_idx_(new std::vector<int>()),
-        op_desc_(nullptr),
-        grad_op_desc_(nullptr) {}
+  OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {}

   virtual ~OpBase() {
-    delete input_vars_;
-    delete output_vars_;
-    delete pre_ops_;
-    delete pre_ops_out_idx_;
-
     if (grad_op_desc_) delete grad_op_desc_;
-    if (grad_to_var_) delete grad_to_var_;
   }

-  std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
+  std::map<std::string, std::vector<VarBase*>> ApplyGrad();

-  std::vector<VarBase*>* input_vars_;
-  std::vector<VarBase*>* output_vars_;
-  std::vector<OpBase*>* pre_ops_;
-  std::vector<int>* pre_ops_out_idx_;
   framework::OpDesc* op_desc_;
   framework::OpDesc* grad_op_desc_;
-  std::unordered_map<std::string, std::string>* grad_to_var_;
+
+  std::map<std::string, std::vector<VarBase*>> input_vars_;
+  std::map<std::string, std::vector<VarBase*>> output_vars_;
+  std::map<std::string, std::vector<OpBase*>> pre_ops_;
+  std::map<std::string, std::vector<int>> pre_ops_out_idx_;
+
+  std::map<std::string, std::vector<framework::Variable*>> grad_input_vars_;
+  std::map<std::string, std::vector<framework::Variable*>> grad_output_vars_;

   framework::BlockDesc* block_;
 };
...
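Taken together, the two hunks above change ownership and bookkeeping: VarBase now allocates and deletes its forward and gradient Variables itself, and OpBase records graph edges per operator slot name in value-typed maps instead of heap-allocated flat vectors. A hypothetical fragment showing how a tracer might fill the new slot-keyed members (the slot names "X", "Y", "Out" and the variables x, y, out are illustrative, not from the diff):

// Illustrative only: recording a two-input op in the new slot-keyed maps.
OpBase* op = new OpBase();
op->input_vars_["X"] = {x};        // VarBase* inputs, grouped by slot name
op->input_vars_["Y"] = {y};
op->output_vars_["Out"] = {out};
op->pre_ops_["X"] = {x->pre_op_};  // producer ops, walked during backward
op->pre_ops_out_idx_["X"] = {x->pre_op_out_idx_};
out->pre_op_ = op;                 // wire the output back to its producer
out->pre_op_out_name_ = "Out";
out->pre_op_out_idx_ = 0;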
@@ -69,17 +69,17 @@ void TestWord2vecPrediction(const std::string& model_path) {
   std::vector<PaddleTensor> outputs;
   CHECK(predictor->Run(slots, &outputs));

-  PADDLE_ENFORCE(outputs.size(), 1UL);
+  PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
   // Check the output buffer size and result of each tid.
-  PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+  PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
   float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
                      0.000932706};
   const size_t num_elements = outputs.front().data.length() / sizeof(float);
   // The outputs' buffers are in CPU memory.
   for (size_t i = 0; i < std::min(static_cast<size_t>(5UL), num_elements);
        i++) {
-    LOG(INFO) << "data: "
-              << static_cast<float*>(outputs.front().data.data())[i];
+    LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
+              << " result: " << result[i];
     PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
                    result[i]);
   }
...
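The first two assertions were switched to PADDLE_ENFORCE_EQ because PADDLE_ENFORCE evaluates only its first argument as the condition and treats the rest as message payload, so the old form passed whenever the value was non-zero:

PADDLE_ENFORCE(outputs.size(), 1UL);     // old: merely asserts size() != 0
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);  // new: asserts size() == 1

By the same reasoning, the PADDLE_ENFORCE left inside the loop still only checks that the output element is non-zero rather than equal to result[i]; presumably a candidate for the same treatment.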
@@ -127,6 +127,7 @@ struct Argument {
                       std::function<bool(const framework::ir::Node*)>);
   DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
+  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);

   // The program transformed by IR analysis phase.
   DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
...
@@ -75,6 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument,
               argument->tensorrt_node_teller_ptr());
     pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
     pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
+    pass->Set("min_subgraph_size",
+              new int(argument->tensorrt_min_subgraph_size()));
   }
   // graph_ = pass->Apply(std::move(graph_));
...
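The DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, ...) line in the previous hunk is what generates the argument->tensorrt_min_subgraph_size() accessor used here, and pass->Set(...) hands the value to the TensorRT subgraph pass as an owned attribute. A hedged sketch of the consumer side — the surrounding pass code and the variable names are assumptions, not shown in this diff:

// Inside the TensorRT subgraph pass (illustrative):
int min_subgraph_size = Get<int>("min_subgraph_size");
if (static_cast<int>(subgraph_nodes.size()) < min_subgraph_size) {
  // Building a TensorRT engine for a tiny subgraph costs more than it saves,
  // so leave it to the regular executor.
  return;
}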
@@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
       // TODO(Superjomn) should avoid the case when a TensorArray is a
       // parameter.
       if (var_name == "feed" || var_name == "fetch") continue;
-      if (var->Type() == typeid(framework::LoDTensorArray)) {
+      if (var->IsType<framework::LoDTensorArray>()) {
         VLOG(4) << "collect " << var_name;
         arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
       }
...
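The old code compared var->Type() against a typeid(...) expression; the new IsType<T>() member keeps the type test behind the Variable interface, so callers no longer depend on how the type tag is represented. Conceptually it behaves like the following sketch (illustration only, not the literal implementation in variable.h):

template <typename T>
bool Variable::IsType() const {
  // True when the variable currently holds a value of type T.
  return holder_ != nullptr &&
         holder_->Type() == std::type_index(typeid(T));
}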
@@ -46,7 +46,7 @@ endif()
 register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})

 # warpctc_op needs cudnn 7 or above
-if (WITH_GPU AND NOT WIN32)
+if (WITH_GPU)
   if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
   else()
...