diff --git a/CMakeLists.txt b/CMakeLists.txt index d43df124bdee2d568a0c09d5acd35d5ff96f4654..24262c1821dab88bdf7202349fcde0e9dd6a0820 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ option(WITH_INFERENCE "Compile fluid inference library" ON) option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) +option(WITH_FAST_MATH "Make use of fast math library" OFF) # PY_VERSION if(NOT PY_VERSION) diff --git a/Dockerfile b/Dockerfile index 634be18a51bf61e96a8bf6f263b6674a7932d6e4..738bba9bc2e1ab19709722fe04f1490b1b13bd4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y --allow-downgrades patchelf \ + python3 python3-dev python3-pip \ git python-pip python-dev python-opencv openssh-server bison \ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ @@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN easy_install -U pip && \ +RUN pip3 install -U wheel && \ + pip3 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ + easy_install -U pip && \ pip install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip install pre-commit 'ipython==5.3.0' && \ +RUN pip3 install pre-commit 'ipython==5.3.0' && \ + pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 install opencv-python && \ + pip install pre-commit 'ipython==5.3.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip install opencv-python #For docstring checker +RUN pip3 install pylint pytest astroid isort RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ +RUN pip3 install -r /root/requirements.txt RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev +RUN pip3 install certifi urllib3[secure] RUN pip install certifi urllib3[secure] diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 6ed51c648478efb9784d0c43b169c285e740e0f3..24de8d9d7ced5f8111cc5d65f761b7506bde048e 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS /usr/local/opt/openblas/lib) find_path(OPENBLAS_INC_DIR NAMES cblas.h - PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) find_library(OPENBLAS_LIB NAMES openblas diff --git a/cmake/configure.cmake b/cmake/configure.cmake index ce1857582bd3e8ab3077158384beaae36a83a4b2..e9852f00b1835adec31373f58ac538f9685251e0 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING) endif() if(WIN32) - # windows stupid compile option for all targets. + # windows header option for all targets. add_definitions(-D_XKEYCHECK_H) + # Use symbols instead of absolute path, reduce the cmake link command length. + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1) + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1) + SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@") + SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@") + + # Specify the program to use when building static libraries + SET(CMAKE_C_CREATE_STATIC_LIBRARY " lib ") + SET(CMAKE_CXX_CREATE_STATIC_LIBRARY " lib ") + + # set defination for the dll export + if (NOT MSVC) + message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.") + endif(NOT MSVC) endif(WIN32) if(NOT WITH_GOLANG) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 03c73786a6c31868b1893bfcb319e43e37db1a3d..f507bb41a1103c093e9569176ee868cfaac6bf7b 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -175,7 +175,10 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") endif(NOT WIN32) -list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") +if(WITH_FAST_MATH) + # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") +endif() # in cuda9, suppress cuda warning on eigen list(APPEND CUDA_NVCC_FLAGS "-w") # Set :expt-relaxed-constexpr to suppress Eigen warnings diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index ed054ff41ae0ec5a4b31dd256e397129cba3e8f1..84354c446e2f54fa13b90fa37221eed90968b251 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( PREFIX ${ANAKIN_SOURCE_DIR} UPDATE_COMMAND "" CMAKE_ARGS ${CMAKE_ARGS_PREFIX} + -DUSE_LOGGER=YES -DUSE_X86_PLACE=YES -DBUILD_WITH_UNIT_TEST=NO -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index e029300eee9b99582f085f6b650e03f7dacc091a..573ad5e5f06a93f38f24c6a8af3b45767e93a1a4 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -3,6 +3,14 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) +if(NOT WITH_FAST_MATH) + # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html + # enables some optimizations which might affect the accuracy of the result. + # This currently enables the SSE vectorization of sin() and cos(), + # and speedups sqrt() for single precision. + # Defined to 1 by default. Define it to 0 to disable. + add_definitions(-DEIGEN_FAST_MATH=0) +endif() if(WITH_AMD_GPU) ExternalProject_Add( diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index c3fbe4dbdb28f1008bb274ee18293db348bfc6ed..755dbd610c40c2d9b85d3017b6f000a869b0f39a 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) - SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) + SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" @@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND}) ENDIF(NOT WIN32) SET(CBLAS_PROVIDER openblas) IF(WITH_C_API) - INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas) + INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) # Because libopenblas.a is a symbolic link of another library, thus need to # install the whole directory. IF(ANDROID) @@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND}) ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") -MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}") -INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR}) +MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}") +INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) # FIXME(gangliao): generate cblas target to track all high performance # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e0556a0babc74ba6efa0a190d4f7b77416bef3bf..343e44ab4bc21c1a656048b675062f1b897bbc77 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -27,7 +27,6 @@ endfunction() CheckCompilerCXX11Flag() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - # safe_set_flag # # Set a compile flag only if compiler is support @@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name) endif() endmacro() +macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared + if (BUILD_SHARED_LIBS) + return() # if build shared libs, the flags keep same with '/MD' + endif(BUILD_SHARED_LIBS) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS) @@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "") # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. + +# https://github.com/PaddlePaddle/Paddle/issues/12773 +if (NOT WIN32) set(COMMON_FLAGS -fPIC -fno-omit-frame-pointer + -Werror -Wall -Wextra -Wnon-virtual-dtor @@ -114,11 +131,6 @@ set(COMMON_FLAGS -Wno-error=terminate # Warning in PADDLE_ENFORCE ) -# https://github.com/PaddlePaddle/Paddle/issues/12773 -if (NOT WIN32) -list(APPEND COMMON_FLAGS -Werror) -endif() - set(GPU_COMMON_FLAGS -fPIC -fno-omit-frame-pointer @@ -133,30 +145,55 @@ set(GPU_COMMON_FLAGS -Wno-error=array-bounds # Warnings in Eigen::array ) +else(NOT WIN32) +set(COMMON_FLAGS + "/w") #disable all warnings. +set(GPU_COMMON_FLAGS + "/w") #disable all warnings +endif(NOT WIN32) + if (APPLE) if(NOT CMAKE_CROSSCOMPILING) # On Mac OS X build fat binaries with x86_64 architectures by default. set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) endif() -else() + # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 + set (COMMON_FLAGS -Wno-deprecated-register) +endif(APPLE) + +if(LINUX) set(GPU_COMMON_FLAGS -Wall -Wextra -Werror ${GPU_COMMON_FLAGS}) -endif() +endif(LINUX) if(UNIX AND NOT APPLE) # except apple from nix*Os family set(LINUX TRUE) endif(UNIX AND NOT APPLE) - foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) + endforeach() foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() + +if(WIN32) +# windows build turn off warnings. +safe_set_static_flag() + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/W3") + string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/W3") + endforeach(flag_var) +endif(WIN32) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 0f9521616952a2857222feab8c38fb480761ee2d..a777a4974cc377db103a470698f817612a4e9a32 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1,11 +1,9 @@ add_custom_target(paddle_apis ALL - DEPENDS paddle_v2_apis paddle_fluid_apis) + DEPENDS paddle_v2_apis) add_custom_target(paddle_docs ALL DEPENDS paddle_v2_docs paddle_v2_docs_cn - paddle_fluid_docs paddle_fluid_docs_cn paddle_mobile_docs paddle_mobile_docs_cn) add_subdirectory(v2) -add_subdirectory(fluid) add_subdirectory(mobile) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 66e0345c299730c113ffbdc8dd3c1fa32f872f3d..8d95dc0591e1d6bd815cc697528191c2ee8c1cfe 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -102,8 +102,8 @@ class Float16Transpiler: continue for input_arg in current_op.input_arg_names: if input_arg in self.input_map: - current_op.rename_input(input_arg, - self.input_map[input_arg]) + current_op._rename_input(input_arg, + self.input_map[input_arg]) def _remove_unused_var(self): ''' @@ -187,7 +187,7 @@ class Float16Transpiler: shape=var.shape, persistable=var.persistable) find_op(var) - var.op.rename_output(var_name, tmp_var_name) + var.op._rename_output(var_name, tmp_var_name) self.block._insert_op( i, type="cast", diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 2f9371a2870346e5a553f9389717efa64f34d37b..6418da2a7e51c51575ff56aeabedff5452458fbc 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -6,29 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords= paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None) -paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -41,10 +21,10 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ -paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None)) +paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None @@ -69,7 +49,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'use_mkldnn', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, False, None, False, None)) +paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None)) @@ -82,14 +62,14 @@ paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) -paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) -paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) +paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) +paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False)) paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) -paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) -paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, False, None, None, None, False, False)) +paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) +paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -166,13 +146,30 @@ paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_v paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)) -paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) +paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)) +paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)) +paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) +paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) +paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')) +paddle.fluid.layers.sum ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.logical_and ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_or ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -235,23 +232,6 @@ paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')) paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) -paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -285,11 +265,11 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) -paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) +paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) -paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1)) +paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -318,13 +298,18 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)) +paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) @@ -333,11 +318,11 @@ paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpo paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ -paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False)) +paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) -paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True, False)) +paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) @@ -377,7 +362,7 @@ paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> Non paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)) -paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim'], varargs=None, keywords='kwargs', defaults=(None,)) +paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)) paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)) paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index ee1f655e25dedb8846bb26275072fd9f6c1f123e..519a00fb073b08f6c88de8186de187476b548fd3 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -13,3 +13,5 @@ if(WITH_INFERENCE) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) endif() + +add_subdirectory(train) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d8cbe5d9e491555a94e9420995149041213ab79..844291140602a7a0aac9d9d40256deaf9d8a4c60 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,3 +1,4 @@ + # windows treat symbolic file as a real file, which is different with unix # We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) @@ -9,11 +10,23 @@ function(windows_symbolic TARGET) if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") endif() - add_custom_command(OUTPUT .${src}.cu + + # only copy the xx.cu to .xx.cu when the content are modified + set(copy_flag 1) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) + if (SOURCE_STR STREQUAL TARGET_STR) + set(copy_flag 0) + endif() + endif() + if (copy_flag) + add_custom_command(OUTPUT .${src}.cu COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" COMMENT "create hidden file of ${src}.cu") - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + endif(copy_flag) + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() @@ -56,9 +69,9 @@ else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() if (NOT WIN32) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) else() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) endif (NOT WIN32) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) @@ -81,6 +94,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu if(WITH_GPU) if (WIN32) + # windows treat symbolic file as a real file, which is different with unix + # We create a hidden file and compile it instead of origin source file. windows_symbolic(hidden_file SRCS data_type_transform.cu) nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) add_dependencies(data_type_transform hidden_file) @@ -141,20 +156,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + if(WITH_DISTRIBUTE) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() if (NOT WIN32) - cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor - graph graph_viz_pass multi_devices_graph_pass - multi_devices_graph_print_pass multi_devices_graph_check_pass - fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + graph build_strategy + fast_threaded_ssa_graph_executor) endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) @@ -167,15 +184,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) -# cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) - -# disable test temporarily. -# TODO https://github.com/PaddlePaddle/Paddle/issues/11971 -# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op -# channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op -# conditional_block_op while_op assign_op print_op executor proto_desc) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h deleted file mode 100644 index 722bf8e8ecba0c9cbc5e3ad737dbf73148d2873c..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // for size_t -#include // NOLINT -#include -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -enum class ChannelAction { - SEND = 0, - RECEIVE = 1, - CLOSE = 2, -}; - -// Channel is the abstract class of buffered and un-buffered channels. -template -class Channel { - public: - virtual bool CanSend() = 0; - virtual bool CanReceive() = 0; - virtual void Send(T*) = 0; - virtual bool Receive(T*) = 0; - virtual size_t Cap() = 0; - virtual void Lock() = 0; - - virtual void Unlock() = 0; - virtual bool IsClosed() = 0; - virtual void Close() = 0; - virtual ~Channel() {} - - virtual void AddToSendQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) = 0; - virtual void AddToReceiveQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) = 0; - virtual void RemoveFromSendQ(const void* referrer) = 0; - virtual void RemoveFromReceiveQ(const void* referrer) = 0; -}; - -// Forward declaration of channel implementations. -template -class ChannelImpl; - -template -Channel* MakeChannel(size_t buffer_size) { - return new ChannelImpl(buffer_size); -} - -template -void CloseChannel(Channel* ch) { - ch->Close(); -} - -/* - * The ChannelHolder class serves two main purposes: - * 1. It acts as a unified wrapper for the different kinds of - * channels, i.e. Buffered and Unbuffered channels. This is - * similar to the ReaderHolder class. - * 2. It also helps us in TypeHiding. This is similar to the - * PlaceHolder implementations in variable.h and tensor.h. - */ -class ChannelHolder { - public: - template - void Reset(size_t buffer_size) { - holder_.reset(new PlaceholderImpl(buffer_size)); - } - - template - void Send(T* data) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - PADDLE_ENFORCE_EQ( - holder_->Type(), std::type_index(typeid(T)), - "Channel type is not same as the type of the data being sent"); - // Static cast should be safe because we have ensured that types are same - Channel* channel = static_cast*>(holder_->Ptr()); - PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); - channel->Send(data); - } - - template - bool Receive(T* data) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - PADDLE_ENFORCE_EQ( - holder_->Type(), std::type_index(typeid(T)), - "Channel type is not same as the type of the data being sent"); - Channel* channel = static_cast*>(holder_->Ptr()); - PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); - return channel->Receive(data); - } - - bool IsClosed() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->IsClosed(); - } - - bool CanSend() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->CanSend(); - } - - bool CanReceive() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->CanReceive(); - } - - void close() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Close(); - } - - size_t Cap() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->Cap(); - } - - void Lock() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Lock(); - } - - void Unlock() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Unlock(); - } - - template - void AddToSendQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToSendQ(referrer, data, cond, cb); - } - } - - template - void AddToReceiveQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToReceiveQ(referrer, data, cond, cb); - } - } - - void RemoveFromSendQ(const void* referrer) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->RemoveFromSendQ(referrer); - } - - void RemoveFromReceiveQ(const void* referrer) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->RemoveFromReceiveQ(referrer); - } - - inline bool IsInitialized() const { return holder_ != nullptr; } - - inline const std::type_index Type() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->Type(); - } - - private: - /** - * @note Placeholder hides type T, so it doesn't appear as a template - * parameter of ChannelHolder. - */ - struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_index Type() const = 0; - virtual void* Ptr() const = 0; - virtual bool IsClosed() = 0; - virtual bool CanSend() = 0; - virtual bool CanReceive() = 0; - virtual void RemoveFromSendQ(const void* referrer) = 0; - virtual void RemoveFromReceiveQ(const void* referrer) = 0; - virtual void Close() = 0; - virtual void Lock() = 0; - virtual void Unlock() = 0; - virtual size_t Cap() = 0; - }; - - template - struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(size_t buffer_size) - : type_(std::type_index(typeid(T))) { - channel_.reset(MakeChannel(buffer_size)); - } - - virtual const std::type_index Type() const { return type_; } - - virtual void* Ptr() const { return static_cast(channel_.get()); } - - virtual bool IsClosed() { - if (channel_) { - return channel_->IsClosed(); - } - return false; - } - - virtual bool CanSend() { - if (channel_) { - return channel_->CanSend(); - } - return false; - } - - virtual bool CanReceive() { - if (channel_) { - return channel_->CanReceive(); - } - return false; - } - - virtual void RemoveFromSendQ(const void* referrer) { - if (channel_) { - channel_->RemoveFromSendQ(referrer); - } - } - - virtual void RemoveFromReceiveQ(const void* referrer) { - if (channel_) { - channel_->RemoveFromReceiveQ(referrer); - } - } - - virtual void Close() { - if (channel_) channel_->Close(); - } - - virtual size_t Cap() { - if (channel_) - return channel_->Cap(); - else - return -1; - } - - virtual void Lock() { - if (channel_) channel_->Lock(); - } - - virtual void Unlock() { - if (channel_) channel_->Unlock(); - } - - std::unique_ptr> channel_; - const std::type_index type_; - }; - - // Pointer to a PlaceholderImpl object - std::unique_ptr holder_; -}; - -} // namespace framework -} // namespace paddle - -#include "paddle/fluid/framework/channel_impl.h" diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h deleted file mode 100644 index 26d454534e1ae38c4f83376c0836a45781ea9101..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel_impl.h +++ /dev/null @@ -1,369 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include // for size_t -#include -#include // NOLINT -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -template -class ChannelImpl : public paddle::framework::Channel { - friend Channel *paddle::framework::MakeChannel(size_t); - friend void paddle::framework::CloseChannel(Channel *); - - public: - virtual bool CanSend(); - virtual bool CanReceive(); - virtual void Send(T *); - virtual bool Receive(T *); - virtual size_t Cap() { return cap_; } - virtual void Lock(); - virtual void Unlock(); - virtual bool IsClosed(); - virtual void Close(); - explicit ChannelImpl(size_t); - virtual ~ChannelImpl(); - - virtual void AddToSendQ(const void *referrer, T *data, - std::shared_ptr cond, - std::function cb); - virtual void AddToReceiveQ(const void *referrer, T *data, - std::shared_ptr cond, - std::function cb); - - virtual void RemoveFromSendQ(const void *referrer); - virtual void RemoveFromReceiveQ(const void *referrer); - - private: - struct QueueMessage { - T *data; - std::shared_ptr cond; - bool chan_closed = false; - bool completed = false; - const void *referrer; // TODO(thuan): figure out better way to do this - std::function callback; - - explicit QueueMessage(T *item) - : data(item), cond(std::make_shared()) {} - - QueueMessage(T *item, std::shared_ptr cond) - : data(item), cond(cond) {} - - void Wait(std::unique_lock &lock) { - cond->wait(lock, [this]() { return completed; }); - } - - void Notify() { - completed = true; - cond->notify_all(); - } - }; - - void send_return() { - send_ctr--; - destructor_cond_.notify_all(); - } - - bool recv_return(bool value) { - recv_ctr--; - destructor_cond_.notify_all(); - return value; - } - - std::shared_ptr get_first_message( - std::deque> *queue, ChannelAction action) { - while (!queue->empty()) { - // Check whether this message was added by Select - // If this was added by Select then execute the callback - // to check if you can execute this message. The callback - // can return false if some other case was executed in Select. - // In that case just discard this QueueMessage and process next. - std::shared_ptr m = queue->front(); - queue->pop_front(); - if (m->callback == nullptr || m->callback(action)) return m; - } - return nullptr; - } - - size_t cap_; - std::recursive_mutex mu_; - bool closed_; - std::deque buf_; - std::deque> recvq; - std::deque> sendq; - std::atomic send_ctr{0}; - std::atomic recv_ctr{0}; - std::condition_variable_any destructor_cond_; -}; - -template -ChannelImpl::ChannelImpl(size_t capacity) - : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) { - PADDLE_ENFORCE_GE(capacity, 0); -} - -template -bool ChannelImpl::CanSend() { - std::lock_guard lock{mu_}; - return !closed_ && (!recvq.empty() || buf_.size() < cap_); -} - -template -bool ChannelImpl::CanReceive() { - std::lock_guard lock{mu_}; - return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0); -} - -template -void ChannelImpl::Send(T *item) { - send_ctr++; - std::unique_lock lock{mu_}; - - // If channel is closed, throw exception - if (closed_) { - send_return(); - lock.unlock(); - PADDLE_THROW("Cannot send on closed channel"); - } - - // If there is a receiver, directly pass the value we want - // to send to the receiver, bypassing the channel buffer if any - if (!recvq.empty()) { - std::shared_ptr m = - get_first_message(&recvq, ChannelAction::SEND); - - if (m != nullptr) { - *(m->data) = std::move(*item); - m->Notify(); - send_return(); - return; - } else { - Send(item); - send_return(); - return; - } - } - - // Unbuffered channel will always bypass this - // If buffered channel has space in buffer, - // write the element to the buffer. - if (buf_.size() < cap_) { - // Copy to buffer - buf_.push_back(std::move(*item)); - send_return(); - return; - } - - // Block on channel, because some receiver will complete - // the operation for us - auto m = std::make_shared(item); - sendq.push_back(m); - m->Wait(lock); - if (m->chan_closed) { - send_return(); - lock.unlock(); - PADDLE_THROW("Cannot send on closed channel"); - } - send_return(); -} - -template -bool ChannelImpl::Receive(T *item) { - recv_ctr++; - std::unique_lock lock{mu_}; - - // If channel is closed and buffer is empty or - // channel is unbuffered - if (closed_ && buf_.empty()) return recv_return(false); - - // If there is a sender, directly receive the value we want - // from the sender. In case of a buffered channel, read from - // buffer and move front of send queue to the buffer - if (!sendq.empty()) { - std::shared_ptr m = - get_first_message(&sendq, ChannelAction::RECEIVE); - if (buf_.size() > 0) { - // Case 1 : Channel is Buffered - // Do Data transfer from front of buffer - // and add a QueueMessage to the buffer - *item = std::move(buf_.front()); - buf_.pop_front(); - // If first message from sendq is not null - // add it to the buffer and notify it - if (m != nullptr) { - // Copy to buffer - buf_.push_back(std::move(*(m->data))); - m->Notify(); - } // Ignore if there is no first message - } else { - // Case 2: Channel is Unbuffered - // Do data transfer from front of SendQ - // If front is nullptr, then recursively call itself - if (m != nullptr) { - *item = std::move(*(m->data)); - m->Notify(); - } else { - return recv_return(Receive(item)); - } - } - return recv_return(true); - } - - // If this is a buffered channel and there are items in buffer - if (buf_.size() > 0) { - // Directly read from buffer - *item = std::move(buf_.front()); - buf_.pop_front(); - // return true - return recv_return(true); - } - - // No sender available, block on this channel - // Some receiver will complete the option for us - auto m = std::make_shared(item); - recvq.push_back(m); - m->Wait(lock); - - return recv_return(!m->chan_closed); -} - -template -void ChannelImpl::Lock() { - mu_.lock(); -} - -template -void ChannelImpl::Unlock() { - mu_.unlock(); -} - -template -bool ChannelImpl::IsClosed() { - std::lock_guard lock{mu_}; - return closed_; -} - -template -void ChannelImpl::Close() { - std::unique_lock lock{mu_}; - - if (closed_) { - // TODO(abhinavarora): closing an already closed channel should panic - lock.unlock(); - return; - } - - closed_ = true; - - // Empty the readers - while (!recvq.empty()) { - std::shared_ptr m = recvq.front(); - recvq.pop_front(); - m->chan_closed = true; - - // Execute callback function (if any) - if (m->callback != nullptr) { - m->callback(ChannelAction::CLOSE); - } - - m->Notify(); - } - - // Empty the senders - while (!sendq.empty()) { - std::shared_ptr m = sendq.front(); - sendq.pop_front(); - m->chan_closed = true; - - // Execute callback function (if any) - if (m->callback != nullptr) { - m->callback(ChannelAction::CLOSE); - } - - m->Notify(); - } -} - -template -void ChannelImpl::AddToSendQ( - const void *referrer, T *data, - std::shared_ptr cond, - std::function cb) { - std::lock_guard lock{mu_}; - auto m = std::make_shared(data, cond); - m->referrer = referrer; - m->callback = cb; - sendq.push_back(m); -} - -template -void ChannelImpl::AddToReceiveQ( - const void *referrer, T *data, - std::shared_ptr cond, - std::function cb) { - std::lock_guard lock{mu_}; - auto m = std::make_shared(data, cond); - m->referrer = referrer; - m->callback = cb; - recvq.push_back(m); -} - -template -void ChannelImpl::RemoveFromSendQ(const void *referrer) { - std::lock_guard lock{mu_}; - - for (auto it = sendq.begin(); it != sendq.end();) { - std::shared_ptr sendMsg = (std::shared_ptr)*it; - - if (sendMsg->referrer == referrer) { - it = sendq.erase(it); - } else { - ++it; - } - } -} - -template -void ChannelImpl::RemoveFromReceiveQ(const void *referrer) { - std::lock_guard lock{mu_}; - - for (auto it = recvq.begin(); it != recvq.end();) { - std::shared_ptr recvMsg = (std::shared_ptr)*it; - - if (recvMsg->referrer == referrer) { - it = recvq.erase(it); - } else { - ++it; - } - } -} - -template -ChannelImpl::~ChannelImpl() { - Close(); - // The destructor must wait for all readers and writers to complete their task - // The channel has been closed, so we will not accept new readers and writers - std::unique_lock lock{mu_}; - destructor_cond_.wait(lock, - [this]() { return send_ctr == 0 && recv_ctr == 0; }); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc deleted file mode 100644 index 542d791f6bbdf7d68a4786998ccc0233fff6473d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel_test.cc +++ /dev/null @@ -1,1008 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" - -#include // NOLINT -#include // NOLINT -#include "gtest/gtest.h" - -using paddle::framework::Channel; -using paddle::framework::ChannelHolder; -using paddle::framework::MakeChannel; -using paddle::framework::CloseChannel; - -TEST(Channel, ChannelCapacityTest) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - EXPECT_EQ(ch->Cap(), buffer_size); - CloseChannel(ch); - delete ch; - - ch = MakeChannel(0); - EXPECT_EQ(ch->Cap(), 0U); - CloseChannel(ch); - delete ch; -} - -void RecevingOrderEqualToSendingOrder(Channel *ch, int num_items) { - unsigned sum_send = 0; - std::thread t([&]() { - for (int i = 0; i < num_items; i++) { - ch->Send(&i); - sum_send += i; - } - }); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - for (int i = 0; i < num_items; i++) { - int recv = -1; - EXPECT_EQ(ch->Receive(&recv), true); - EXPECT_EQ(recv, i); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - CloseChannel(ch); - t.join(); - unsigned expected_sum = (num_items * (num_items - 1)) / 2; - EXPECT_EQ(sum_send, expected_sum); - delete ch; -} - -TEST(Channel, SufficientBufferSizeDoesntBlock) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - for (size_t i = 0; i < buffer_size; ++i) { - ch->Send(&i); - } - - size_t out; - for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), true); // should not block - EXPECT_EQ(out, i); - } - CloseChannel(ch); - delete ch; -} - -// This tests that a channel must return false -// on send and receive performed after closing the channel. -// Receive will only return false after close when queue is empty. -// By creating separate threads for sending and receiving, we make this -// function able to test both buffered and unbuffered channels. -void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { - const size_t data = 5; - std::thread send_thread{[&]() { - size_t i = data; - ch->Send(&i); // should not block - }}; - - std::thread recv_thread{[&]() { - size_t i; - EXPECT_EQ(ch->Receive(&i), true); // should not block - EXPECT_EQ(i, data); - }}; - - send_thread.join(); - recv_thread.join(); - - // After closing send should panic. Receive should - // also false as there is no data in queue. - CloseChannel(ch); - send_thread = std::thread{[&]() { - size_t i = data; - bool is_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - }}; - recv_thread = std::thread{[&]() { - size_t i; - // should return false because channel is closed and queue is empty - EXPECT_EQ(ch->Receive(&i), false); - }}; - - send_thread.join(); - recv_thread.join(); -} - -TEST(Channel, SendReceiveClosedBufferedChannelPanics) { - size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - SendReceiveWithACloseChannelShouldPanic(ch); - delete ch; -} - -TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) { - auto ch = MakeChannel(0); - SendReceiveWithACloseChannelShouldPanic(ch); - delete ch; -} - -TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - - for (size_t i = 0; i < buffer_size; ++i) { - ch->Send(&i); // sending should not block - } - - size_t out; - for (size_t i = 0; i < buffer_size / 2; ++i) { - EXPECT_EQ(ch->Receive(&out), true); // receiving should not block - EXPECT_EQ(out, i); - } - - CloseChannel(ch); - - for (size_t i = buffer_size / 2; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), - true); // receving should return residual values. - EXPECT_EQ(out, i); - } - - for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), - false); // receiving on closed channel should return false - } - delete ch; -} - -TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - std::thread t([&]() { - // Try to write more than buffer size. - for (size_t i = 0; i < 2 * buffer_size; ++i) { - if (i < buffer_size) { - ch->Send(&i); // should block after 10 iterations - } else { - bool is_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - } - } - }); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - CloseChannel(ch); - t.join(); - delete ch; -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { - auto ch = MakeChannel(0); - RecevingOrderEqualToSendingOrder(ch, 20); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) { - // Test that Receive Order is same as Send Order when number of items - // sent is less than size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 5); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) { - // Test that Receive Order is same as Send Order when number of items - // sent is equal to size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 10); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) { - // Test that Receive Order is same as Send Order when number of items - // sent is greater than the size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 20); -} - -void ChannelCloseUnblocksReceiversTest(Channel *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the channel - // This should unblock all receivers - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - if (isBuffered) { - // If ch is Buffered, atleast 4 threads must be blocked. - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (!thread_ended[i]) ct++; - } - EXPECT_GE(ct, 4); - } else { - // If ch is UnBuffered, all the threads should be blocked. - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly close the thread - // This should unblock all senders - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - if (isBuffered) { - // Verify that only 1 send was successful - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that closing a buffered channel also unblocks -// any receivers waiting on the channel -TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { - auto ch = MakeChannel(1); - ChannelCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing a buffered channel also unblocks -// any senders waiting for channel to have write space -TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { - auto ch = MakeChannel(1); - ChannelCloseUnblocksSendersTest(ch, true); - delete ch; -} - -// This tests that closing an unbuffered channel also unblocks -// unblocks any receivers waiting for senders -TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { - auto ch = MakeChannel(0); - ChannelCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing an unbuffered channel also unblocks -// unblocks any senders waiting for senders -TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { - auto ch = MakeChannel(0); - ChannelCloseUnblocksSendersTest(ch, false); - delete ch; -} - -TEST(Channel, UnbufferedLessReceiveMoreSendTest) { - auto ch = MakeChannel(0); - unsigned sum_send = 0; - // Send should block after three iterations - // since we only have three receivers. - std::thread t([&]() { - // Try to send more number of times - // than receivers - for (int i = 0; i < 4; i++) { - try { - ch->Send(&i); - sum_send += i; - } catch (paddle::platform::EnforceNotMet e) { - } - } - }); - for (int i = 0; i < 3; i++) { - int recv; - ch->Receive(&recv); - EXPECT_EQ(recv, i); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - EXPECT_EQ(sum_send, 3U); - - CloseChannel(ch); - t.join(); - delete ch; -} - -TEST(Channel, UnbufferedMoreReceiveLessSendTest) { - auto ch = MakeChannel(0); - unsigned sum_send = 0; - unsigned sum_receive = 0; - // The receiver should block after 5 - // iterations, since there are only 5 senders. - std::thread t([&]() { - for (int i = 0; i < 8; i++) { - int recv; - ch->Receive(&recv); // should block after the fifth iteration. - EXPECT_EQ(recv, i); - sum_receive += i; - } - }); - for (int i = 0; i < 5; i++) { - ch->Send(&i); - sum_send += i; - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - EXPECT_EQ(sum_send, 10U); - EXPECT_EQ(sum_receive, 10U); - // send three more elements - for (int i = 5; i < 8; i++) { - ch->Send(&i); - sum_send += i; - } - - CloseChannel(ch); - t.join(); - EXPECT_EQ(sum_send, 28U); - EXPECT_EQ(sum_receive, 28U); - delete ch; -} - -// This tests that destroying a channel unblocks -// any senders waiting for channel to have write space -void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - if (isBuffered) { - // If channel is buffered, verify that atleast 4 threads are blocked - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (thread_ended[i] == false) ct++; - } - // Atleast 4 threads must be blocked - EXPECT_GE(ct, 4); - } else { - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly destroy the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - // Count number of successful sends - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - - if (isBuffered) { - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } else { - // In unbuffered channel, no send should be successful - EXPECT_EQ(ct, 0); - } - - // Join all threads - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that destroying a channel also unblocks -// any receivers waiting on the channel -void ChannelDestroyUnblockReceivers(Channel *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - // All reads should return false - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait - - // Verify that all threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - // delete the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { - size_t buffer_size = 1; - auto ch = MakeChannel(buffer_size); - ChannelDestroyUnblockReceivers(ch); -} - -TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) { - size_t buffer_size = 1; - auto ch = MakeChannel(buffer_size); - ChannelDestroyUnblockSenders(ch, true); -} - -// This tests that destroying an unbuffered channel also unblocks -// unblocks any receivers waiting for senders -TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) { - auto ch = MakeChannel(0); - ChannelDestroyUnblockReceivers(ch); -} - -TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) { - auto ch = MakeChannel(0); - ChannelDestroyUnblockSenders(ch, false); -} - -TEST(ChannelHolder, ChannelHolderCapacityTest) { - const size_t buffer_size = 10; - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(buffer_size); - EXPECT_EQ(ch->Cap(), buffer_size); - delete ch; - - ch = new ChannelHolder(); - ch->Reset(0); - EXPECT_EQ(ch->Cap(), 0U); - delete ch; -} - -void ChannelHolderSendReceive(ChannelHolder *ch) { - unsigned sum_send = 0; - std::thread t([&]() { - for (int i = 0; i < 5; i++) { - ch->Send(&i); - sum_send += i; - } - }); - for (int i = 0; i < 5; i++) { - int recv; - EXPECT_EQ(ch->Receive(&recv), true); - EXPECT_EQ(recv, i); - } - - ch->close(); - t.join(); - EXPECT_EQ(sum_send, 10U); -} - -TEST(ChannelHolder, ChannelHolderBufferedSendReceiveTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(10); - ChannelHolderSendReceive(ch); - delete ch; -} - -TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderSendReceive(ch); - delete ch; -} - -TEST(ChannelHolder, ChannelUninitializedTest) { - ChannelHolder *ch = new ChannelHolder(); - EXPECT_EQ(ch->IsInitialized(), false); - int i = 10; - bool send_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - send_exception = true; - } - EXPECT_EQ(send_exception, true); - - bool recv_exception = false; - try { - ch->Receive(&i); - } catch (paddle::platform::EnforceNotMet e) { - recv_exception = true; - } - EXPECT_EQ(recv_exception, true); - - bool is_exception = false; - try { - ch->Type(); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -TEST(ChannelHolder, ChannelInitializedTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(2); - EXPECT_EQ(ch->IsInitialized(), true); - // Channel should remain intialized even after close - ch->close(); - EXPECT_EQ(ch->IsInitialized(), true); - delete ch; -} - -TEST(ChannelHolder, TypeMismatchSendTest) { - // Test with unbuffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - bool is_exception = false; - bool boolean_data = true; - try { - ch->Send(&boolean_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; - - // Test with Buffered Channel - ch = new ChannelHolder(); - ch->Reset(10); - is_exception = false; - int int_data = 23; - try { - ch->Send(&int_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -TEST(ChannelHolder, TypeMismatchReceiveTest) { - // Test with unbuffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - bool is_exception = false; - bool float_data; - try { - ch->Receive(&float_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; - - // Test with Buffered Channel - ch = new ChannelHolder(); - ch->Reset(10); - is_exception = false; - int int_data = 23; - try { - ch->Receive(&int_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the channel - // This should unblock all receivers - ch->close(); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - if (isBuffered) { - // If ch is Buffered, atleast 4 threads must be blocked. - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (!thread_ended[i]) ct++; - } - EXPECT_GE(ct, 4); - } else { - // If ch is UnBuffered, all the threads should be blocked. - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly close the thread - // This should unblock all senders - ch->close(); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - if (isBuffered) { - // Verify that only 1 send was successful - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that closing a channelholder unblocks -// any receivers waiting on the channel -TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) { - // Check for buffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderCloseUnblocksReceiversTest(ch); - delete ch; - - // Check for unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing a channelholder unblocks -// any senders waiting for channel to have write space -TEST(Channel, ChannelHolderCloseUnblocksSendersTest) { - // Check for buffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderCloseUnblocksSendersTest(ch, true); - delete ch; - - // Check for unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderCloseUnblocksSendersTest(ch, false); - delete ch; -} - -// This tests that destroying a channelholder unblocks -// any senders waiting for channel -void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - if (isBuffered) { - // If channel is buffered, verify that atleast 4 threads are blocked - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (thread_ended[i] == false) ct++; - } - // Atleast 4 threads must be blocked - EXPECT_GE(ct, 4); - } else { - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly destroy the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - // Count number of successfuld sends - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - - if (isBuffered) { - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } else { - // In unbuffered channel, no send should be successful - EXPECT_EQ(ct, 0); - } - - // Join all threads - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that destroying a channelholder also unblocks -// any receivers waiting on the channel -void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - // All reads should return false - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - // delete the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderDestroyUnblockReceivers(ch); - // ch is already deleted already deleted in - // ChannelHolderDestroyUnblockReceivers - - // Check for Unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderDestroyUnblockReceivers(ch); -} - -TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderDestroyUnblockSenders(ch, true); - // ch is already deleted already deleted in - // ChannelHolderDestroyUnblockReceivers - - // Check for Unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderDestroyUnblockSenders(ch, false); -} - -// This tests that closing a channelholder many times. -void ChannelHolderManyTimesClose(ChannelHolder *ch) { - const int kNumThreads = 15; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to send data to channel. - for (size_t i = 0; i < kNumThreads / 3; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *ended) { - int data = 10; - ch->Send(&data); - *ended = true; - }, - &thread_ended[i]); - } - - // Launches threads that try to receive data to channel. - for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - if (ch->Receive(&data)) { - EXPECT_EQ(data, 10); - } - *p = true; - }, - &thread_ended[i]); - } - - // Launches threads that try to close the channel. - for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - if (!ch->IsClosed()) { - ch->close(); - } - *p = true; - }, - &thread_ended[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait - - // Verify that all threads are unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - EXPECT_TRUE(ch->IsClosed()); - // delete the channel - delete ch; - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(10); - ChannelHolderManyTimesClose(ch); -} diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc deleted file mode 100644 index bbf67f5ba92150f70cf45d49e3f4ca0a16393541..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/concurrency_test.cc +++ /dev/null @@ -1,292 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" - -USE_NO_KERNEL_OP(go); -USE_NO_KERNEL_OP(channel_close); -USE_NO_KERNEL_OP(channel_create); -USE_NO_KERNEL_OP(channel_recv); -USE_NO_KERNEL_OP(channel_send); -USE_NO_KERNEL_OP(elementwise_add); -USE_NO_KERNEL_OP(select); -USE_NO_KERNEL_OP(conditional_block); -USE_NO_KERNEL_OP(equal); -USE_NO_KERNEL_OP(assign); -USE_NO_KERNEL_OP(while); -USE_NO_KERNEL_OP(print); - -namespace f = paddle::framework; -namespace p = paddle::platform; - -namespace paddle { -namespace framework { - -template -LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place, - std::string name, T value) { - // Create LoDTensor of dim [1] - auto var = scope->Var(name); - auto tensor = var->GetMutable(); - tensor->Resize({1}); - T *expect = tensor->mutable_data(place); - expect[0] = value; - return tensor; -} - -void AddOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, AttributeMap attrs, - BlockDesc *block) { - // insert op - auto op = block->AppendOp(); - op->SetType(type); - for (auto &kv : inputs) { - op->SetInput(kv.first, kv.second); - } - for (auto &kv : outputs) { - op->SetOutput(kv.first, kv.second); - } - op->SetAttrMap(attrs); -} - -void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place, - BlockDesc *casesBlock, int caseId, int caseType, - std::string caseChannel, std::string caseVarName, - std::function func) { - std::string caseCondName = std::string("caseCond") + std::to_string(caseId); - std::string caseCondXVarName = - std::string("caseCondX") + std::to_string(caseId); - - BlockDesc *caseBlock = program->AppendBlock(*casesBlock); - func(caseBlock, scope); - - CreateVariable(scope, *place, caseCondName, false); - CreateVariable(scope, *place, caseCondXVarName, caseId); - CreateVariable(scope, *place, caseVarName, caseId); - - scope->Var("step_scope"); - - AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}}, - {{"Out", {caseCondName}}}, {}, casesBlock); - - AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}}, - {{"Out", {}}, {"Scope", {"step_scope"}}}, - {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock); -} - -void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, - BlockDesc *parentBlock, std::string dataChanName, - std::string quitChanName) { - BlockDesc *whileBlock = program->AppendBlock(*parentBlock); - - CreateVariable(scope, *place, "whileExitCond", true); - CreateVariable(scope, *place, "caseToExecute", -1); - CreateVariable(scope, *place, "case1var", 0); - - CreateVariable(scope, *place, "xtemp", 0); - - // TODO(thuan): Need to create fibXToSend, since channel send moves the actual - // data, - // which causes the data to be no longer accessible to do the fib calculation - // TODO(abhinav): Change channel send to do a copy instead of a move! - CreateVariable(scope, *place, "fibXToSend", 0); - - CreateVariable(scope, *place, "fibX", 0); - CreateVariable(scope, *place, "fibY", 1); - CreateVariable(scope, *place, "quitVar", 0); - - BlockDesc *casesBlock = program->AppendBlock(*whileBlock); - std::function f = [](BlockDesc *caseBlock) {}; - - // TODO(thuan): Remove this once we change channel send to do a copy instead - // of move - AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock); - - // Case 0: Send to dataChanName - std::function case0Func = [&]( - BlockDesc *caseBlock, Scope *scope) { - AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock); - AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock); - AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}}, - {{"Out", {"fibY"}}}, {}, caseBlock); - }; - AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend", - case0Func); - std::string case0Config = - std::string("0,1,") + dataChanName + std::string(",fibXToSend"); - - // Case 1: Receive from quitChanName - std::function case2Func = [&]( - BlockDesc *caseBlock, Scope *scope) { - // Exit the while loop after we receive from quit channel. - // We assign a false to "whileExitCond" variable, which will - // break out of while_op loop - CreateVariable(scope, *place, "whileFalse", false); - AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {}, - caseBlock); - }; - AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar", - case2Func); - std::string case1Config = - std::string("1,2,") + quitChanName + std::string(",quitVar"); - - // Select block - AddOp("select", {{"X", {dataChanName, quitChanName}}, - {"case_to_execute", {"caseToExecute"}}}, - {{"Out", {}}}, - {{"sub_block", casesBlock}, - {"cases", std::vector{case0Config, case1Config}}}, - whileBlock); - - scope->Var("stepScopes"); - AddOp("while", - {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}}, - {{"Out", {}}, {"StepScopes", {"stepScopes"}}}, - {{"sub_block", whileBlock}}, parentBlock); -} - -TEST(Concurrency, Go_Op) { - Scope scope; - p::CPUPlace place; - - // Initialize scope variables - p::CPUDeviceContext ctx(place); - - // Create channel variable - scope.Var("Channel"); - - // Create Variables, x0 will be put into channel, - // result will be pulled from channel - CreateVariable(&scope, place, "Status", false); - CreateVariable(&scope, place, "x0", 99); - CreateVariable(&scope, place, "result", 0); - - framework::Executor executor(place); - ProgramDesc program; - BlockDesc *block = program.MutableBlock(0); - - // Create channel OP - AddOp("channel_create", {}, {{"Out", {"Channel"}}}, - {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}}, - block); - - // Create Go Op routine - BlockDesc *goOpBlock = program.AppendBlock(program.Block(0)); - AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}}, - {{"Status", {"Status"}}}, {}, goOpBlock); - - // Create Go Op - AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}}, - block); - - // Create Channel Receive Op - AddOp("channel_recv", {{"Channel", {"Channel"}}}, - {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block); - - // Create Channel Close Op - AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block); - - // Check the result tensor to make sure it is set to 0 - const LoDTensor &tensor = (scope.FindVar("result"))->Get(); - auto *initialData = tensor.data(); - EXPECT_EQ(initialData[0], 0); - - executor.Run(program, &scope, 0, true, true); - - // After we call executor.run, the Go operator should do a channel_send to - // set the "result" variable to 99. - auto *finalData = tensor.data(); - EXPECT_EQ(finalData[0], 99); -} - -/** - * This test implements the fibonacci function using go_op and select_op - */ -TEST(Concurrency, Select) { - Scope scope; - p::CPUPlace place; - - // Initialize scope variables - p::CPUDeviceContext ctx(place); - - CreateVariable(&scope, place, "Status", false); - CreateVariable(&scope, place, "result", 0); - CreateVariable(&scope, place, "currentXFib", 0); - - framework::Executor executor(place); - ProgramDesc program; - BlockDesc *block = program.MutableBlock(0); - - // Create channel OP - std::string dataChanName = "Channel"; - scope.Var(dataChanName); - AddOp("channel_create", {}, {{"Out", {dataChanName}}}, - {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); - - std::string quitChanName = "Quit"; - scope.Var(quitChanName); - AddOp("channel_create", {}, {{"Out", {quitChanName}}}, - {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); - - // Create Go Op routine, which loops 10 times over fibonacci sequence - CreateVariable(&scope, place, "xReceiveVar", 0); - - BlockDesc *goOpBlock = program.AppendBlock(program.Block(0)); - for (int i = 0; i < 10; ++i) { - AddOp("channel_recv", {{"Channel", {dataChanName}}}, - {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock); - AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}}, - {{"first_n", 100}, - {"summarize", -1}, - {"print_tensor_name", false}, - {"print_tensor_type", true}, - {"print_tensor_shape", false}, - {"print_tensor_lod", false}, - {"print_phase", std::string("FORWARD")}, - {"message", std::string("X: ")}}, - goOpBlock); - } - - CreateVariable(&scope, place, "quitSignal", 0); - AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}}, - {{"Status", {"Status"}}}, {}, goOpBlock); - - // Create Go Op - AddOp("go", {{"X", {dataChanName, quitChanName}}}, {}, - {{"sub_block", goOpBlock}}, block); - - AddFibonacciSelect(&scope, &place, &program, block, dataChanName, - quitChanName); - - // Create Channel Close Op - AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block); - AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block); - - executor.Run(program, &scope, 0, true, true); - - // After we call executor.run, "result" variable should be equal to 34 - // (which is 10 loops through fibonacci sequence) - const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get(); - auto *finalData = tensor.data(); - EXPECT_EQ(finalData[0], 34); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a8e0c4a3fedfd56e38de7568be6b3f2e76a4b25f..e0a3ef5a9c6c53c42ebea1a41cac0d18a77781b2 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu # device_context reduce_op_handle ) cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) + +cc_library(build_strategy SRCS build_strategy.cc DEPS + graph_viz_pass multi_devices_graph_pass + multi_devices_graph_print_pass multi_devices_graph_check_pass + fuse_elewise_add_act_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a6b497fa897e3882995688bf36704b1d77ea962 --- /dev/null +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/details/build_strategy.h" + +#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelExecutorPassBuilder : public ir::PassBuilder { + public: + explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) + : ir::PassBuilder(), strategy_(strategy) { + // Add a graph viz pass to record a graph. + if (!strategy_.debug_graphviz_path_.empty()) { + auto viz_pass = AppendPass("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + } + + // Add op fusion. + if (strategy.fuse_elewise_add_act_ops_) { + auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); + // Add a graph viz pass to record a graph. + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = AppendPass("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); + viz_pass->Set("graph_viz_path", + new std::string(graph_path)); + } + } + + // Convert graph to run on multi-devices. + auto multi_devices_pass = AppendPass("multi_devices_pass"); + multi_devices_pass->SetNotOwned("strategy", + &strategy_); + + // Add a graph print pass to record a graph with device info. + if (!strategy_.debug_graphviz_path_.empty()) { + auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); + multi_devices_print_pass->SetNotOwned( + "debug_graphviz_path", &strategy_.debug_graphviz_path_); + multi_devices_print_pass->Set( + "graph_printer", new details::GraphvizSSAGraphPrinter); + } + + // Verify that the graph is correct for multi-device executor. + AppendPass("multi_devices_check_pass"); + } + + private: + BuildStrategy strategy_; +}; + +std::shared_ptr BuildStrategy::CreatePassesFromStrategy() + const { + pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); + return pass_builder_; +} + +std::unique_ptr BuildStrategy::Apply( + const ProgramDesc &main_program, const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶m_names, + const std::vector &local_scopes, +#ifdef PADDLE_WITH_CUDA + const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { +#else + const bool use_cuda) const { +#endif + // Create a default one if not initialized by user. + if (!pass_builder_) { + CreatePassesFromStrategy(); + } + + std::unique_ptr graph(new ir::Graph(main_program)); + + for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + if (pass->Type() == "multi_devices_pass") { + pass->Erase("places"); + pass->SetNotOwned>("places", &places); + pass->Erase("loss_var_name"); + pass->SetNotOwned("loss_var_name", &loss_var_name); + pass->Erase("params"); + pass->SetNotOwned>("params", + ¶m_names); + pass->Erase("local_scopes"); + pass->SetNotOwned>("local_scopes", + &local_scopes); +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; + pass->Erase("nccl_ctxs"); + pass->SetNotOwned("nccl_ctxs", nctx); +#endif + } + graph = pass->Apply(std::move(graph)); + } + return graph; +} +} // namespace details +} // namespace framework +} // namespace paddle + +USE_PASS(fuse_elewise_add_act_pass); +USE_PASS(graph_viz_pass); +USE_PASS(multi_devices_pass); +USE_PASS(multi_devices_check_pass); +USE_PASS(multi_devices_print_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 77cafa49f18158ea4187908856e10adaa0f916c9..02c4bea16916d58a6d0fce8918f8fceb9ff9356e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -15,6 +15,17 @@ #pragma once #include +#include + +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { @@ -57,6 +68,30 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; bool enable_data_balance_{false}; + + // User normally doesn't need to call this API. + // The PassBuilder allows for more customized insert, remove of passes + // from python side. + // A new PassBuilder is created based on configs defined above and + // passes are owned by the PassBuilder. + std::shared_ptr CreatePassesFromStrategy() const; + + // Apply the passes built by the pass_builder_. The passes will be + // applied to the Program and output an ir::Graph. + std::unique_ptr Apply( + const ProgramDesc &main_program, + const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶m_names, + const std::vector &local_scopes, +#ifdef PADDLE_WITH_CUDA + const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; +#else + const bool use_cuda) const; +#endif + + private: + mutable std::shared_ptr pass_builder_; }; } // namespace details diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h index 21f75957be5f33f3dfc09c41fa9a1e1ca590f99e..090517ff3c1822c2e62e61fad05d49e1c8db8573 100644 --- a/paddle/fluid/framework/details/cow_ptr.h +++ b/paddle/fluid/framework/details/cow_ptr.h @@ -20,79 +20,37 @@ namespace paddle { namespace framework { namespace details { -// Change it to thread safe flags if needed. -class ThreadUnsafeOwnershipFlags { +template +class COWPtr { public: - explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} - - ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete; - ThreadUnsafeOwnershipFlags& operator=( - const ThreadUnsafeOwnershipFlags& other) = delete; - ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default; - - void SetOwnership(bool flag) { flag_ = flag; } - - // Invoke the callback if it is not owned. - template - void AcquireOwnershipOnce(Callback acquire) { - if (!flag_) { - acquire(); - flag_ = true; - } - } + typedef std::shared_ptr RefPtr; private: - bool flag_; -}; + RefPtr m_sp; -// Copy-On-Write pointer. -// It will hold a T* pointer, and only copy once when `MutableData` is invoked. -// -// The template parameter OwnershipFlags should have: -// * a constructor takes a bool. True if own. -// * SetOwnership(bool flag). -// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not -// owned. -// -// https://en.wikipedia.org/wiki/Copy-on-write -template -class COWPtr { public: - // Ctor from raw pointer. - explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} + COWPtr() : m_sp(nullptr) {} + explicit COWPtr(T* t) : m_sp(t) {} - // Move methods. Steal ownership from origin - COWPtr(COWPtr&& other) - : payload_(other.payload_), ownership_{std::move(other.ownership_)} {} - COWPtr& operator=(COWPtr&& origin) = default; + const T& Data() const { return *m_sp; } - // Copy methods. Not own payload - COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {} - COWPtr& operator=(const COWPtr& other) { - payload_ = other.payload_; - ownership_.SetOwnership(false); - return *this; - } - - // Access read only data. - const T& Data() const { return *payload_; } - - // Access mutable data. If the data is not owned, the data will be copied - // before. T* MutableData() { - ownership_.AcquireOwnershipOnce( - [this] { payload_.reset(new T(*payload_)); }); - return payload_.get(); + DetachIfNotUnique(); + return m_sp.get(); } - private: - // Actual data pointer. - std::shared_ptr payload_; + void DetachIfNotUnique() { + T* tmp = m_sp.get(); + if (!(tmp == nullptr || m_sp.unique())) { + Detach(); + } + } - // Ownership flag. - OwnershipFlags ownership_; + void Detach() { + T* tmp = m_sp.get(); + m_sp = RefPtr(new T(*tmp)); + } }; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc index d2142af277c0b356d83941b3baab1947cce31dac..5b055d7cb4d127dc20f2cf70869134f24a93d429 100644 --- a/paddle/fluid/framework/details/cow_ptr_test.cc +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -30,6 +30,14 @@ TEST(COWPtr, all) { ASSERT_EQ(ptr2.Data(), 10); } +TEST(COWPtr, change_old) { + COWPtr ptr(new int{0}); + COWPtr ptr2 = ptr; + *ptr.MutableData() = 10; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(ptr.Data(), 10); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index b1ce551ce73de33bcede187c72feebad6e2fa1a5..2d1f688d64ece3322e253b0c070264b9eb73d678 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -80,15 +80,15 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( // This is weird but there is really some variables without var_desc // in computation_op if (var_desc == nullptr) { - if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr) - continue; - } else { - if (var_desc->Persistable()) continue; - auto var_type = var_desc->Proto()->type().type(); - if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { - continue; - } + var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); + if (var_desc == nullptr) continue; + } + + if (var_desc->Persistable()) continue; + auto var_type = var_desc->Proto()->type().type(); + if (var_type != proto::VarType::LOD_TENSOR && + var_type != proto::VarType::SELECTED_ROWS) { + continue; } // compute op only runs in one device diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8d8042a0563a21dad216ffd53a474322c378ace6..70ec6e90a4d0106b7f838e51b8357798daa4b10d 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { var->GetMutable(); - } else if (var_type == proto::VarType::CHANNEL) { - var->GetMutable(); } else if (var_type == proto::VarType::RAW) { // GetMutable will be called in operator } else { PADDLE_THROW( "Variable type %d is not in " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", var_type); } } diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 460401df5473f8650f450a2bd247a703d91b6048..25f0ba418433571343c5b2bbfdbf9fb940eaec52 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -126,7 +126,6 @@ message VarType { LOD_TENSOR_ARRAY = 13; PLACE_LIST = 14; READER = 15; - CHANNEL = 16; // Any runtime decided variable type is raw // raw variables should manage their own allocations // in operators like nccl_op @@ -158,12 +157,6 @@ message VarType { message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } optional ReaderDesc reader = 5; - message ChannelDesc { - required Type data_type = 1; - required int64 capacity = 2; - } - optional ChannelDesc channel = 6; - message Tuple { repeated Type element_type = 1; } optional Tuple tuple = 7; } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 4dca3ceb4569fb708c7a98621c5239acbe217586..0076a8bece31f9a977b375717c25688fc0c95819 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,5 +1,6 @@ set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") +file(APPEND ${pass_file} "\#pragma once\n") file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") @@ -28,12 +29,13 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) -if(WITH_MKLDNN) - pass_library(conv_relu_mkldnn_fuse_pass inference) -endif() +if (WITH_MKLDNN) + pass_library(conv_relu_mkldnn_fuse_pass inference) +endif () pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) +pass_library(embedding_fc_lstm_fuse_pass inference) pass_library(fc_gru_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference) @@ -41,12 +43,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") +cc_library(pass_builder SRCS pass_builder.cc DEPS pass) + cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -if(WITH_MKLDNN) - cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) -endif() +if (WITH_MKLDNN) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) +endif () diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5..1c75cb5a82029b6a542a3a2f031a353f5e40f4ea 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -257,6 +257,22 @@ std::unique_ptr AttentionLSTMFusePass::ApplyImpl( std::unique_ptr graph) const { PDPattern external_pattern, subblock_pattern; + // Use the following variables to tell whether this model is RNN1. + // This fuse can only works on the RNN1 model. + std::unordered_set specified_vars({"data_lod_attention", + "cell_init", "hidden_init", + "data", "week", "minute"}); + int count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsVar() && specified_vars.count(node->Name())) { + ++count; + } + } + if (count < specified_vars.size()) { + return graph; + } + + // Continue to fuse. FindWhileOp(graph.get()); return graph; } diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index 09c5ec59d66445bdbd5349447b125be89cb2efdf..d7df6389cfd595324e284e0da10f65213ccee80f 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -26,8 +26,6 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( PADDLE_ENFORCE(graph.get()); FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); - std::unordered_set nodes2delete; - GraphPatternDetector gpd; auto* conv_input = gpd.mutable_pattern() ->NewNode("conv_relu_mkldnn_fuse/conv_input") @@ -42,36 +40,20 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( Graph* g) { VLOG(4) << "handle ConvReLU fuse"; GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, - conv_relu_pattern); // Filter - GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp + conv_relu_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op - // Create an ConvReLU Node. - OpDesc desc; - std::string conv_relu_i_in = subgraph.at(conv_input)->Name(); - std::string conv_relu_w_in = conv_weight->Name(); - std::string conv_relu_b_in = conv_bias->Name(); - std::string conv_relu_out = relu_out->Name(); - desc.SetInput("Input", std::vector({conv_relu_i_in})); - desc.SetInput("Filter", std::vector({conv_relu_w_in})); - desc.SetInput("Bias", std::vector({conv_relu_b_in})); - desc.SetOutput("Output", std::vector({conv_relu_out})); - desc.SetType("conv2d"); - for (auto& attr : conv->Op()->GetAttrMap()) { - desc.SetAttr(attr.first, attr.second); - } - desc.SetAttr("fuse_relu", true); - auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out}); + // Transform Conv node into ConvReLU node. + OpDesc* desc = conv->Op(); + desc->SetOutput("Output", std::vector({relu_out->Name()})); + desc->SetAttr("fuse_relu", true); + GraphSafeRemoveNodes(graph.get(), {relu, conv_out}); PADDLE_ENFORCE(subgraph.count(conv_input)); - IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node); - IR_NODE_LINK_TO(conv_weight, conv_relu_node); - IR_NODE_LINK_TO(conv_bias, conv_relu_node); - IR_NODE_LINK_TO(conv_relu_node, relu_out); + IR_NODE_LINK_TO(conv, relu_out); found_conv_relu_count++; }; diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc index 82b5fa1886098ca3b19c147c307d3f2fc3ba03d6..9dd780ec89ab991d6d99cb66fa2a9b683be2b9ca 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) { for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "conv2d") { - if (node->Op()->HasAttr("use_mkldnn")) { - bool use_mkldnn = boost::get(node->Op()->GetAttr("use_mkldnn")); - if (use_mkldnn) { - if (node->Op()->HasAttr("fuse_relu")) { - bool fuse_relu = boost::get(node->Op()->GetAttr("fuse_relu")); - if (fuse_relu) { - ++conv_relu_count; - } - } - } + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); + ASSERT_TRUE(op->HasAttr("fuse_relu")); + bool fuse_relu = boost::get(op->GetAttr("fuse_relu")); + if (fuse_relu) { + ++conv_relu_count; } } } diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba11f19c9273650113096be3fa23ca077bbc7dd9 --- /dev/null +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h" +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace framework { +namespace ir { + +static int BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Build pattern + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) + ->assert_is_op_input("lookup_table") + ->assert_var_not_persistable(); + patterns::Embedding embedding_pattern(pattern, name_scope); + // TODO(jczaja): Intermediate can only be for val that are not used anywhere + // but lookup table output may go into other LSTM (for reverse + // direction) + auto* embedding_out = embedding_pattern(x); + patterns::FC fc_pattern(pattern, name_scope); + + // fc_out is a tmp var, will be removed after fuse, so marked as intermediate. + auto* fc_out = fc_pattern(embedding_out, with_fc_bias)->AsIntermediate(); + patterns::LSTM lstm_pattern(pattern, name_scope); + lstm_pattern(fc_out); + + // Create New OpDesc + auto embedding_lstm_creator = [&](Node* embedding, Node* W, Node* lstm, + Node* input, Node* weight_x, Node* weight_h, + Node* bias, Node* hidden, Node* cell, + Node* xx, Node* fc_bias) { + OpDesc op_desc; + op_desc.SetType("fused_embedding_fc_lstm"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); + SET_IN(Ids, input); + SET_IN(WeightH, weight_h); + // Neet to have this passed as We need Wc data for peephole connections + SET_IN(Bias, bias); +#undef SET_IN + + // Multiply embeddings with Weights + PADDLE_ENFORCE(scope); + const std::string& embeddings = patterns::UniqueKey("Embeddings"); + auto* embeddings_var = scope->Var(embeddings); + PADDLE_ENFORCE(embeddings_var); + auto* embeddings_tensor = + embeddings_var->GetMutable(); + // Get WeightX size: [single_embedding, fc_size] + // and embedding size: [dict_size, single_embedding] + // and create new size of embeddings eg. [dict_size , hidden_size] + auto* embedding_var = scope->FindVar(W->Name()); + PADDLE_ENFORCE(embedding_var); + const auto& embedding_tensor = embedding_var->Get(); + + const auto& weightx_tensor = + scope->FindVar(weight_x->Name())->Get(); + embeddings_tensor->Resize( + {embedding_tensor.dims()[0], weightx_tensor.dims()[1]}); + + // Multiplie embeddings via WeightsX and add bias + auto embedding_data = embedding_tensor.data(); + auto weightx_data = weightx_tensor.data(); + auto embeddings_data = + embeddings_tensor->mutable_data(platform::CPUPlace()); + + // Adding biases to GEMM result to be + auto* lstm_bias_var = scope->FindVar(bias->Name()); + PADDLE_ENFORCE(lstm_bias_var); + const auto& lstm_bias_tensor = lstm_bias_var->Get(); + + auto alpha = 1.0f; + auto beta = 1.0f; + int m = embedding_tensor.dims()[0]; + int n = weightx_tensor.dims()[1]; + int k = embedding_tensor.dims()[1]; + + // Copy only gate biases values (only actual bias data, not peephole + // weights) + std::vector combined_biases; + combined_biases.reserve(n); + std::copy_n(lstm_bias_tensor.data(), n, + std::back_inserter(combined_biases)); + + if (with_fc_bias) { + // Add FC-bias with LSTM-bias (into GEMM result to be) + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); + const auto& fc_bias_tensor = fc_bias_var->Get(); + for (int i = 0; i < fc_bias_tensor.numel(); i++) { + combined_biases[i] += fc_bias_tensor.data()[i]; + } + } + + // broadcast biases + std::vector ones(m, 1.0f); + paddle::operators::math::CBlas::GEMM( + CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1, + &combined_biases[0], n, 0.0f, embeddings_data, n); + + // Wx*embeddings + biases + paddle::operators::math::CBlas::GEMM( + CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, + embedding_data, k, weightx_data, n, beta, embeddings_data, n); + op_desc.SetInput("Embeddings", {embeddings}); + + // Create temp variables. + const std::string BatchedInput = patterns::UniqueKey("BatchedInput"); + const std::string BatchedCellPreAct = + patterns::UniqueKey("BatchedCellPreAct"); + const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); + + scope->Var(BatchedInput)->GetMutable(); + scope->Var(BatchedCellPreAct)->GetMutable(); + scope->Var(BatchedGate)->GetMutable(); + + op_desc.SetInput("H0", {}); + op_desc.SetInput("C0", {}); + op_desc.SetOutput("Hidden", {hidden->Name()}); + op_desc.SetOutput("Cell", {cell->Name()}); + op_desc.SetOutput("XX", {xx->Name()}); + op_desc.SetOutput("BatchedGate", {BatchedGate}); + op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); + op_desc.SetOutput("BatchedInput", {BatchedInput}); + op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); + // TODO(TJ): get from attr + op_desc.SetAttr("use_seq", true); + + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); +#define OP_SET_OUT(x) \ + const std::string x = patterns::UniqueKey(#x); \ + op_desc.SetOutput(#x, {x}); \ + scope->Var(x)->GetMutable() + OP_SET_OUT(BatchedCell); + OP_SET_OUT(BatchedHidden); + OP_SET_OUT(ReorderedH0); + OP_SET_OUT(ReorderedC0); +#undef OP_SET_OUT + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input, op); + IR_NODE_LINK_TO(weight_x, op); + IR_NODE_LINK_TO(weight_h, op); + IR_NODE_LINK_TO(bias, op); + IR_NODE_LINK_TO(op, hidden); + return op; + }; + + int fusion_count{0}; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, embedding_pattern); + GET_IR_NODE_FROM_SUBGRAPH(W, W, embedding_pattern); + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + + // TODO(jczaja): Add support for is_sparse / is_distributed + auto is_sparse = boost::get(lookup_table->Op()->GetAttr("is_sparse")); + auto is_distributed = + boost::get(lookup_table->Op()->GetAttr("is_distributed")); + + if (is_sparse == true || is_distributed == true) { + return; + } + + if (with_fc_bias) { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, + Bias, Hidden, Cell, fc_out, fc_bias); + // Remove unneeded nodes. + // TODO(jczaja): Proper removing of lookup table + std::unordered_set marked_nodes( + //{lookup_table, mul, lstm, elementwise_add, fc_bias, W}); + {mul, lstm, elementwise_add, fc_bias}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); + embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, + Bias, Hidden, Cell, fc_out, nullptr); + // Remove unneeded nodes. + // TODO(jczaja): Proper removing of lookup table + // std::unordered_set marked_nodes({lookup_table, W, mul, + // lstm}); + std::unordered_set marked_nodes({mul, lstm}); + GraphSafeRemoveNodes(graph, marked_nodes); + } + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +std::unique_ptr EmbeddingFCLSTMFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + true /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(embedding_fc_lstm_fuse_pass, + paddle::framework::ir::EmbeddingFCLSTMFusePass); diff --git a/paddle/fluid/inference/api/timer.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h similarity index 51% rename from paddle/fluid/inference/api/timer.h rename to paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h index 2df5274dc1f2e7ad8e434f1da9d5ae6aee94c784..e5ad3067ec4060e41f1464395f3fc76183de3e66 100644 --- a/paddle/fluid/inference/api/timer.h +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -11,29 +11,30 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #pragma once -#include // NOLINT +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { -namespace inference { +namespace framework { +namespace ir { + +// Fusing of Embedding , FC and LSTM op -// Timer for timer -class Timer { +// Just FC without bias +class EmbeddingFCLSTMFusePass : public FusePassBase { public: - std::chrono::high_resolution_clock::time_point start; - std::chrono::high_resolution_clock::time_point startu; - - void tic() { start = std::chrono::high_resolution_clock::now(); } - double toc() { - startu = std::chrono::high_resolution_clock::now(); - std::chrono::duration time_span = - std::chrono::duration_cast>(startu - - start); - double used_time_ms = static_cast(time_span.count()) * 1000.0; - return used_time_ms; - } + virtual ~EmbeddingFCLSTMFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + const std::string name_scope_{"embedding_fc_lstm_fuse"}; }; -} // namespace inference +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index aa95d3e9f6c8221f6e48d192b73ad5135539dc75..f5c286486520391906a6cd7545041c8a7df614ea 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, const std::string BatchedCellPreAct = patterns::UniqueKey("BatchedCellPreAct"); const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); + const std::string CheckedCell = patterns::UniqueKey("CheckedCell"); scope->Var(BatchedInput)->GetMutable(); scope->Var(BatchedCellPreAct)->GetMutable(); scope->Var(BatchedGate)->GetMutable(); + scope->Var(CheckedCell)->GetMutable(); op_desc.SetInput("H0", {}); op_desc.SetInput("C0", {}); @@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, op_desc.SetOutput("BatchedGate", {BatchedGate}); op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); op_desc.SetOutput("BatchedInput", {BatchedInput}); + op_desc.SetOutput("CheckedCell", {CheckedCell}); op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); // TODO(TJ): get from attr diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 62f94a1c0e5a300438bbe5fea34b9a07df5d9ebf..c54766d95a61ac1a4b61566c6de62cbc86685a1d 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/ir/graph_helper.h" #include +#include #include -#include "paddle/fluid/framework/ir/graph_helper.h" - namespace paddle { namespace framework { namespace ir { @@ -113,6 +113,74 @@ std::map> BuildOperationAdjList( return adj_list; } +size_t GraphNum(const Graph &graph) { + std::unordered_set nodes = graph.Nodes(); + std::unordered_set visited_nodes; + visited_nodes.reserve(nodes.size()); + std::deque q_nodes; + std::vector> graph_nodes; + std::unordered_set g_nodes; + size_t graph_count = 0; + + auto traverse_nodes = [&visited_nodes, + &q_nodes](const std::vector &nodes) { + std::copy_if( + nodes.begin(), nodes.end(), std::back_inserter(q_nodes), + [&visited_nodes](Node *node) { return !visited_nodes.count(node); }); + }; + + while (visited_nodes.size() != nodes.size()) { + if (!q_nodes.empty()) { + auto cur_node = q_nodes.front(); + q_nodes.pop_front(); + visited_nodes.insert(cur_node); + g_nodes.insert(cur_node); + traverse_nodes(cur_node->inputs); + traverse_nodes(cur_node->outputs); + } else { + ++graph_count; + if (g_nodes.size()) { + graph_nodes.emplace_back(g_nodes); + } + g_nodes.clear(); + for (auto &n : nodes) { + if (visited_nodes.count(n) == 0) { + q_nodes.push_back(n); + break; + } + } + } + } + + if (g_nodes.size()) { + graph_nodes.emplace_back(g_nodes); + } + + if (VLOG_IS_ON(10)) { + VLOG(10) << "graph_num: " << graph_nodes.size(); + for (auto &g_n : graph_nodes) { + VLOG(10) << "graph_nodes: " << g_n.size(); + if (g_n.size() < 10) { + std::stringstream out; + for (auto &node : g_n) { + out << "\nNode: " << node->Name() << " in ["; + for (auto &n : node->inputs) { + out << n->Name() << ", "; + } + out << "], out["; + for (auto &n : node->outputs) { + out << n->Name() << ", "; + } + out << "]"; + } + VLOG(10) << out.str(); + } + } + } + + return graph_count; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index cd6c53a07f8f56781989739d995226bd02b3d3d0..ec46b38c01b8c369ab37b4fbd5497ec120d8db91 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -27,6 +27,8 @@ namespace ir { // Test if the graph contains circle. bool HasCircle(const Graph &graph); +size_t GraphNum(const Graph &graph); + // Topology Sort the operations in the graph from inputs to outputs. // `graph` cannot contain circle. std::vector TopologySortOperations(const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index a260dd3da2a7863c06e51aa4feafd824ea254139..cea902809339f9d45b0e2525163f08a3c1c44c95 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) { ASSERT_EQ(node_map.at("op2"), 1UL); ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); } + +void BuildZeroGraph(Graph* g) {} + +void BuildOneGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +void BuildTwoGraphs(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + // o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + // v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + // o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + // v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, GraphNum) { + ProgramDesc prog; + + Graph g(prog); + BuildZeroGraph(&g); + ASSERT_EQ(GraphNum(g), 0); + + Graph g2(prog); + BuildOneGraph(&g2); + ASSERT_EQ(GraphNum(g2), 1); + + Graph g3(prog); + BuildTwoGraphs(&g3); + ASSERT_EQ(GraphNum(g3), 2); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index ef5113819696238d4e06b826bf43064b0f368dea..46c6a52c09e896596aa6d8e1e901955a68a4957d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()( ->AsInput() ->assert_is_persistable_var() ->assert_is_op_input("conv2d", "Filter"); - // Bias - auto *conv_bias_var = pattern->NewNode(conv_bias_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Bias"); // intermediate variable, will be removed in the IR after fuse. auto *conv_out_var = pattern->NewNode(conv_out_repr()) ->AsIntermediate() @@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()( ->AsOutput() ->assert_is_op_output("relu"); - conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) - .LinksTo({conv_out_var}); + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); return relu_out_var; } @@ -698,6 +692,24 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, } } +PDNode *patterns::Embedding::operator()(PDNode *x) { + x->assert_is_op_input("lookup_table", "Ids"); + auto *lookup_table_op = + pattern->NewNode(lookup_table_repr())->assert_is_op("lookup_table"); +#define NEW_NODE(arg__, io__) \ + auto *arg__ = pattern->NewNode(arg__##_repr()) \ + ->assert_is_op_##io__("lookup_table", #arg__); + + NEW_NODE(W, input); + + NEW_NODE(Out, output); +#undef NEW_NODE + + lookup_table_op->LinksFrom({x, W}); + lookup_table_op->LinksTo({Out}); + return Out; +} + PDNode *patterns::LSTM::operator()(PDNode *x) { x->assert_is_op_input("lstm", "Input"); auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 46950ed877cda7cd83ead4e9aa9a3aaae5d5ecfa..508113bf4fcab274394f2705c36eddbf4ba3c77a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -379,7 +379,7 @@ struct PatternBase { // op: conv + relu // named nodes: // conv_input, conv_weight, -// conv_bias, conv_out, conv, +// conv_out, conv, // relu_out, relu struct ConvReLU : public PatternBase { ConvReLU(PDPattern* pattern, const std::string& name_scope) @@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase { PATTERN_DECL_NODE(relu); // declare variable node's name PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_bias); PATTERN_DECL_NODE(conv_out); PATTERN_DECL_NODE(relu_out); }; @@ -419,6 +418,23 @@ struct FC : public PatternBase { PATTERN_DECL_NODE(Out); }; +// Embedding +struct Embedding : public PatternBase { + Embedding(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "embedding") {} + + PDNode* operator()(PDNode* x); + + // declare operator node's name + PATTERN_DECL_NODE(lookup_table); + // Inputs + // + PATTERN_DECL_NODE(Ids); + PATTERN_DECL_NODE(W); // embeddings + // Outputs + PATTERN_DECL_NODE(Out); +}; + struct LSTM : public PatternBase { LSTM(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "lstm") {} diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 8f548913e4e1d9d5bc5bdace8b92db9065cf3b5e..084a4ba2def87eaa8badb3ca2c39865c6e5cb981 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/ir/graph_traits.h" +#include + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index d7158eba62686be57499df697466797e4034ea8f..6cf405efe63d2bc284c4650771a747b27bb4a9f6 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -19,7 +19,6 @@ namespace paddle { namespace framework { namespace ir { std::unique_ptr Pass::Apply(std::unique_ptr graph) const { - PADDLE_ENFORCE(!applied_, "Pass can only Apply() once."); PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); for (const std::string& attr : required_pass_attrs_) { PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 0f14083d259172f5b5f1ed80c7d38312d711beb5..9570c59cff2a6afeb1c607f7219b7b455974d6ce 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -42,6 +42,8 @@ class Pass { attr_dels_.clear(); } + std::string Type() const { return type_; } + std::unique_ptr Apply(std::unique_ptr graph) const; // Get a reference to the attributed previously set. @@ -52,6 +54,21 @@ class Pass { return *boost::any_cast(attrs_.at(attr_name)); } + bool Has(const std::string &attr_name) const { + return attrs_.find(attr_name) != attrs_.end(); + } + + void Erase(const std::string &attr_name) { + if (!Has(attr_name)) { + return; + } + if (attr_dels_.find(attr_name) != attr_dels_.end()) { + attr_dels_[attr_name](); + attr_dels_.erase(attr_name); + } + attrs_.erase(attr_name); + } + // Set a pointer to the attribute. Pass takes ownership of the attribute. template void Set(const std::string &attr_name, AttrType *attr) { @@ -68,13 +85,15 @@ class Pass { // should delete the attribute. template void SetNotOwned(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", + attr_name); attrs_[attr_name] = attr; } protected: - virtual std::unique_ptr ApplyImpl( - std::unique_ptr graph) const = 0; + virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + LOG(FATAL) << "Calling virtual Pass not implemented."; + } private: template @@ -89,7 +108,10 @@ class Pass { required_graph_attrs_.insert(attrs.begin(), attrs.end()); } + void RegisterType(const std::string &type) { type_ = type; } + mutable bool applied_{false}; + std::string type_; std::unordered_set required_pass_attrs_; std::unordered_set required_graph_attrs_; std::map attrs_; @@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar { PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), "'%s' is registered more than once.", pass_type); PassRegistry::Instance().Insert( - pass_type, [this]() -> std::unique_ptr { + pass_type, [this, pass_type]() -> std::unique_ptr { std::unique_ptr pass(new PassType()); pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); + pass->RegisterType(pass_type); return pass; }); } diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0719867b34d13666672b22070ce14dbaf80d85d --- /dev/null +++ b/paddle/fluid/framework/ir/pass_builder.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/pass_builder.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::shared_ptr PassBuilder::AppendPass(const std::string& pass_type) { + auto pass = ir::PassRegistry::Instance().Get(pass_type); + passes_.emplace_back(pass.release()); + return passes_.back(); +} + +void PassBuilder::RemovePass(size_t idx) { + PADDLE_ENFORCE(passes_.size() > idx); + passes_.erase(passes_.begin() + idx); +} + +std::shared_ptr PassBuilder::InsertPass(size_t idx, + const std::string& pass_type) { + PADDLE_ENFORCE(passes_.size() >= idx); + std::shared_ptr pass( + ir::PassRegistry::Instance().Get(pass_type).release()); + passes_.insert(passes_.begin() + idx, std::move(pass)); + return passes_[idx]; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_builder.h b/paddle/fluid/framework/ir/pass_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..733d3a3ad1ab8989ea30fe45cd7e1ffe9432de13 --- /dev/null +++ b/paddle/fluid/framework/ir/pass_builder.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PassBuilder { + public: + PassBuilder() {} + + virtual ~PassBuilder() {} + + // Append a new pass to the end. + std::shared_ptr AppendPass(const std::string& pass_type); + + // Insert a new pass after `idx`. + std::shared_ptr InsertPass(size_t idx, const std::string& pass_type); + + // Remove a new pass at `idx`. + void RemovePass(size_t idx); + + // Returns a list of all passes. + std::vector> AllPasses() const { return passes_; } + + protected: + std::vector> passes_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 5b5011412ed39e033a7a65921e9c64ce2d54c638..6ad7d1df8bdd016b617c820c022ef55f23ba21cd 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) { ASSERT_EQ(graph->Get("copy_test_pass_attr"), 2); ASSERT_EQ(graph->Get("copy_test_graph_attr"), 2); - try { - graph = pass->Apply(std::move(graph)); - } catch (paddle::platform::EnforceNotMet e) { - exception = std::string(e.what()); - } - ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos); + // Allow apply more than once. + graph.reset(new Graph(prog)); + graph->Set("test_graph_attr", new int); + graph = pass->Apply(std::move(graph)); pass = PassRegistry::Instance().Get("test_pass"); pass->SetNotOwned("test_pass_attr", &val); diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 7836ecb1272a07a79a70c9cb040335f9a42e5684..77386f4f069489b6ff7b927a281bdc286ff816e0 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -17,10 +17,13 @@ #include #include #include +#include // NOLINT +#include #include - +#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/memcpy.h" #include "glog/logging.h" @@ -28,206 +31,436 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) +namespace details { +struct CUDABuffer { + void *data_{nullptr}; + size_t size_{0}; + platform::CUDAPlace place_; + + CUDABuffer() {} + CUDABuffer(platform::Place place, size_t size) + : size_(size), place_(boost::get(place)) { + data_ = memory::Alloc(place_, size); + } + + ~CUDABuffer() { ClearMemory(); } + + CUDABuffer(const CUDABuffer &o) = delete; + CUDABuffer &operator=(const CUDABuffer &o) = delete; + + void Resize(platform::Place place, size_t size) { + ClearMemory(); + place_ = boost::get(place); + data_ = memory::Alloc(place_, size); + PADDLE_ENFORCE_NOT_NULL(data_); + size_ = size; + } + + void Swap(CUDABuffer &o) { + std::swap(data_, o.data_); + std::swap(place_, o.place_); + std::swap(size_, o.size_); + } + + private: + void ClearMemory() const { + if (data_ != nullptr) { + memory::Free(place_, data_); + } + } +}; +} // namespace details + // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template class Vector { public: using value_type = T; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; - // Default ctor. Create empty Vector - Vector() { InitEmpty(); } + private: + // The actual class to implement vector logic + class VectorData { + public: + VectorData() : flag_(kDataInCPU) {} + VectorData(size_t count, const T &value) + : cpu_(count, value), flag_(kDataInCPU) {} + VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} + template + explicit VectorData(const std::vector &dat) + : cpu_(dat), flag_(kDataInCPU) {} + ~VectorData() {} + + VectorData(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + } - // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T &value = T()) { - InitEmpty(); - if (count != 0) { - resize(count); - T *ptr = begin(); - for (size_t i = 0; i < count; ++i) { - ptr[i] = value; + VectorData &operator=(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + details::CUDABuffer null; + gpu_.Swap(null); + return *this; + } + + T &operator[](size_t i) { + MutableCPU(); + return cpu_[i]; + } + + const T &operator[](size_t i) const { + ImmutableCPU(); + return cpu_[i]; + } + + size_t size() const { return cpu_.size(); } + + iterator begin() { + MutableCPU(); + return cpu_.begin(); + } + + iterator end() { + MutableCPU(); + return cpu_.end(); + } + + T &front() { + MutableCPU(); + return cpu_.front(); + } + + T &back() { + MutableCPU(); + return cpu_.back(); + } + + const_iterator begin() const { + ImmutableCPU(); + return cpu_.begin(); + } + + const_iterator end() const { + ImmutableCPU(); + return cpu_.end(); + } + + const T &back() const { + ImmutableCPU(); + return cpu_.back(); + } + + T *data() { return &(*this)[0]; } + + const T *data() const { return &(*this)[0]; } + + const T &front() const { + ImmutableCPU(); + return cpu_.front(); + } + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + MutableCPU(); + cpu_.assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + MutableCPU(); + cpu_.push_back(elem); + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + MutableCPU(); + auto out_it = std::back_inserter>(this->cpu_); + std::copy(begin, end, out_it); + } + + // resize the vector + void resize(size_t size) { + MutableCPU(); + cpu_.resize(size); + } + + // get cuda ptr. immutable + const T *CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return reinterpret_cast(gpu_.data_); + } + + // get cuda ptr. mutable + T *CUDAMutableData(platform::Place place) { + const T *ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + cpu_.clear(); + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { return cpu_.capacity(); } + + // reserve data + void reserve(size_t size) const { cpu_.reserve(size); } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + ImmutableCPU(); + return cpu_; + } + + bool operator==(const VectorData &other) const { + ImmutableCPU(); + other.ImmutableCPU(); + return cpu_ == other.cpu_; + } + + std::mutex &Mutex() const { return mtx_; } + + std::unique_ptr CUDAPlace() const { + if (gpu_.data_ == nullptr) { + return nullptr; + } else { + return std::unique_ptr( + new platform::CUDAPlace(gpu_.place_)); } } - } - // Ctor with init_list - Vector(std::initializer_list init) { - if (init.size() == 0) { - InitEmpty(); - } else { - InitByIter(init.size(), init.begin(), init.end()); + private: + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::Place(gpu_.place_))); + auto stream = dev_ctx->stream(); + void *src = gpu_.data_; + void *dst = cpu_.data(); + memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_, + stream); + dev_ctx->Wait(); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; } - } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + CopyCPUDataToCUDA(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && + !(boost::get(place) == gpu_.place_)) { + PADDLE_THROW("This situation should not happen"); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + CopyCPUDataToCUDA(place); + SetFlag(kDataInCUDA); + } else if (!(boost::get(place) == gpu_.place_)) { + PADDLE_THROW("This situation should not happen."); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void CopyCPUDataToCUDA(const platform::Place &place) const { + void *src = cpu_.data(); + gpu_.Resize(place, cpu_.size() * sizeof(T)); + void *dst = gpu_.data_; + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto stream = dev_ctx->stream(); + memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_, + stream); + } + + void ImmutableCPU() const { + if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or + // CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + mutable std::vector cpu_; + mutable details::CUDABuffer gpu_; + mutable int flag_; + + mutable std::mutex mtx_; + }; + + public: + // Default ctor. Create empty Vector + Vector() : m_(new VectorData()) {} + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T &value = T()) + : m_(new VectorData(count, value)) {} + + // Ctor with init_list + Vector(std::initializer_list init) : m_(new VectorData(init)) {} // implicit cast from std::vector. template - Vector(const std::vector &dat) { // NOLINT - if (dat.size() == 0) { - InitEmpty(); - } else { - InitByIter(dat.size(), dat.begin(), dat.end()); - } + Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT } // Copy ctor - Vector(const Vector &other) { this->operator=(other); } + Vector(const Vector &other) { m_ = other.m_; } // Copy operator Vector &operator=(const Vector &other) { - if (other.size() != 0) { - this->InitByIter(other.size(), other.begin(), other.end()); - } else { - InitEmpty(); - } + m_ = other.m_; return *this; } // Move ctor - Vector(Vector &&other) { - this->size_ = other.size_; - this->flag_ = other.flag_; - if (other.cuda_vec_.memory_size()) { - this->cuda_vec_.ShareDataWith(other.cuda_vec_); - } - if (other.cpu_vec_.memory_size()) { - this->cpu_vec_.ShareDataWith(other.cpu_vec_); - } - } + Vector(Vector &&other) { m_ = std::move(other.m_); } // CPU data access method. Mutable. - T &operator[](size_t i) { - MutableCPU(); - return const_cast(cpu_vec_.data())[i]; - } + T &operator[](size_t i) { return (*m_.MutableData())[i]; } // CPU data access method. Immutable. - const T &operator[](size_t i) const { - ImmutableCPU(); - return cpu_vec_.data()[i]; - } + const T &operator[](size_t i) const { return m_.Data()[i]; } // std::vector iterator methods. Based on CPU data access method - size_t size() const { return size_; } + size_t size() const { return m_.Data().size(); } - T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } + iterator begin() { return m_.MutableData()->begin(); } - T *end() { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); - } + iterator end() { return m_.MutableData()->end(); } - T &front() { return *begin(); } + T &front() { return m_.MutableData()->front(); } - T &back() { - auto it = end(); - --it; - return *it; - } + T &back() { return m_.MutableData()->back(); } - const T *begin() const { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); - } + const_iterator begin() const { return m_.Data().begin(); } - const T *end() const { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); - } + const_iterator end() const { return m_.Data().end(); } - const T *cbegin() const { return begin(); } + const_iterator cbegin() const { return begin(); } - const T *cend() const { return end(); } + const_iterator cend() const { return end(); } - const T &back() const { - auto it = end(); - --it; - return *it; - } + const T &back() const { return m_.Data().back(); } - T *data() { return begin(); } + T *data() { return m_.MutableData()->data(); } - const T *data() const { return begin(); } + const T *data() const { return m_.Data().data(); } - const T &front() const { return *begin(); } + const T &front() const { return m_.Data().front(); } // end of std::vector iterator methods // assign this from iterator. // NOTE: the iterator must support `end-begin` template void assign(Iter begin, Iter end) { - InitByIter(end - begin, begin, end); + m_.MutableData()->assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. - void push_back(T elem) { - if (size_ + 1 > capacity()) { - reserve((size_ + 1) << 1); - } - *end() = elem; - ++size_; - } + void push_back(T elem) { m_.MutableData()->push_back(elem); } // extend a vector by iterator. // NOTE: the iterator must support end-begin template void Extend(It begin, It end) { - size_t pre_size = size_; - resize(pre_size + (end - begin)); - T *ptr = this->begin() + pre_size; - for (; begin < end; ++begin, ++ptr) { - *ptr = *begin; - } + m_.MutableData()->Extend(begin, end); } // resize the vector void resize(size_t size) { - if (size + 1 <= capacity()) { - size_ = size; - } else { - MutableCPU(); - Tensor cpu_tensor; - platform::Place cpu = platform::CPUPlace(); - T *ptr = cpu_tensor.mutable_data( - framework::make_ddim({static_cast(size)}), cpu); - const T *old_ptr = - cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); - if (old_ptr != nullptr) { - std::copy(old_ptr, old_ptr + size_, ptr); - } - size_ = size; - cpu_vec_.ShareDataWith(cpu_tensor); + if (m_.Data().size() != size) { + m_.MutableData()->resize(size); } } // get cuda ptr. immutable const T *CUDAData(platform::Place place) const { - PADDLE_ENFORCE(platform::is_gpu_place(place), - "CUDA Data must on CUDA place"); - ImmutableCUDA(place); - return cuda_vec_.data(); + { + auto &mtx = m_.Data().Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_.Data().CUDAPlace(); + if (cuda_place == nullptr || + *cuda_place == boost::get(place)) { + return m_.Data().CUDAData(place); + } + } + // If m_ contains CUDAData in a different place. Detach manually. + m_.Detach(); + return CUDAData(place); } // get cuda ptr. mutable T *CUDAMutableData(platform::Place place) { - const T *ptr = CUDAData(place); - flag_ = kDirty | kDataInCUDA; - return const_cast(ptr); + { + auto &mtx = m_.Data().Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_.Data().CUDAPlace(); + if (cuda_place == nullptr || + *cuda_place == boost::get(place)) { + return m_.MutableData()->CUDAMutableData(place); + } + } + // If m_ contains CUDAData in a different place. Detach manually. + m_.Detach(); + return CUDAMutableData(place); } // clear - void clear() { - size_ = 0; - flag_ = kDirty | kDataInCPU; - } + void clear() { m_.MutableData()->clear(); } - size_t capacity() const { - return cpu_vec_.memory_size() / SizeOfType(typeid(T)); - } + size_t capacity() const { return m_.Data().capacity(); } // reserve data - void reserve(size_t size) { - size_t pre_size = size_; - resize(size); - resize(pre_size); - } + void reserve(size_t size) { m_.Data().reserve(size); } // the unify method to access CPU or CUDA data. immutable. const T *Data(platform::Place place) const { @@ -248,12 +481,7 @@ class Vector { } // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { - std::vector result; - result.resize(size()); - std::copy(begin(), end(), result.begin()); - return result; - } + operator std::vector() const { return m_.Data(); } bool operator==(const Vector &other) const { if (size() != other.size()) return false; @@ -267,118 +495,11 @@ class Vector { return true; } - private: - void InitEmpty() { - size_ = 0; - flag_ = kDataInCPU; - } - - template - void InitByIter(size_t size, Iter begin, Iter end) { - platform::Place cpu = platform::CPUPlace(); - T *ptr = this->cpu_vec_.template mutable_data( - framework::make_ddim({static_cast(size)}), cpu); - for (size_t i = 0; i < size; ++i) { - *ptr++ = *begin++; - } - flag_ = kDataInCPU | kDirty; - size_ = size; - } - - enum DataFlag { - kDataInCPU = 0x01, - kDataInCUDA = 0x02, - // kDirty means the data has been changed in one device. - kDirty = 0x10 - }; - - void CopyToCPU() const { - // COPY GPU Data To CPU - TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); - WaitPlace(cuda_vec_.place()); - } - - void MutableCPU() { - if (IsInCUDA() && IsDirty()) { - CopyToCPU(); - } - flag_ = kDirty | kDataInCPU; - } - - void ImmutableCUDA(platform::Place place) const { - if (IsDirty()) { - if (IsInCPU()) { - TensorCopy(cpu_vec_, boost::get(place), - &cuda_vec_); - WaitPlace(place); - UnsetFlag(kDirty); - SetFlag(kDataInCUDA); - } else if (IsInCUDA() && !(place == cuda_vec_.place())) { - framework::Tensor tmp; - TensorCopy(cuda_vec_, boost::get(place), &tmp); - WaitPlace(cuda_vec_.place()); - cuda_vec_.ShareDataWith(tmp); - // Still dirty - } else { - // Dirty && DataInCUDA && Device is same - // Do nothing - } - } else { - if (!IsInCUDA()) { - // Even data is not dirty. However, data is not in CUDA. Copy data. - TensorCopy(cpu_vec_, boost::get(place), - &cuda_vec_); - WaitPlace(place); - SetFlag(kDataInCUDA); - } else if (!(place == cuda_vec_.place())) { - framework::Tensor tmp; - WaitPlace(cuda_vec_.place()); - TensorCopy(cuda_vec_, boost::get(place), &tmp); - WaitPlace(cuda_vec_.place()); - WaitPlace(place); - cuda_vec_.ShareDataWith(tmp); - } else { - // Not Dirty && DataInCUDA && Device is same - // Do nothing. - } - } - } - - void ImmutableCPU() const { - if (IsDirty() && - !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. - CopyToCPU(); - UnsetFlag(kDirty); - } - SetFlag(kDataInCPU); - } - - void UnsetFlag(int flag) const { flag_ &= ~flag; } - void SetFlag(int flag) const { flag_ |= flag; } + const void *Handle() const { return &m_.Data(); } - bool IsDirty() const { return flag_ & kDirty; } - - bool IsInCUDA() const { return flag_ & kDataInCUDA; } - - bool IsInCPU() const { return flag_ & kDataInCPU; } - - static void WaitPlace(const platform::Place place) { - if (platform::is_gpu_place(place)) { - platform::DeviceContextPool::Instance() - .Get(boost::get(place)) - ->Wait(); - } - } - - static T &EmptyDummy() { - static T dummy = T(); - return dummy; - } - - mutable int flag_; - mutable Tensor cpu_vec_; - mutable Tensor cuda_vec_; - size_t size_; + private: + // Vector is an COW object. + mutable details::COWPtr m_; }; #else // PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba10687d65cfbbac89cfc76879c8b202ebd03229 --- /dev/null +++ b/paddle/fluid/framework/naive_executor.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { + +// These code can be shared with Executor. +static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { + if (var_type == proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", + var_type); + } +} + +void NaiveExecutor::Prepare(Scope *parent_scope, + const ProgramDesc &program_desc, int block_id, + bool with_feed_fetch_ops) { + if (!parent_scope) { + scope_ = new framework::Scope; + } else { + scope_ = &parent_scope->NewScope(); + } + CreateVariables(program_desc, scope_, block_id); + CreateOps(program_desc, block_id, with_feed_fetch_ops); +} + +void NaiveExecutor::Run() { + for (auto &op : ops_) { + VLOG(4) << "run " << op->Type(); + op->Run(*scope_, place_); + } +} + +void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope, + int block_id) { + PADDLE_ENFORCE(scope); + auto &global_block = desc.Block(block_id); + + const Scope *ancestor_scope = scope; + while (ancestor_scope->parent()) { + ancestor_scope = ancestor_scope->parent(); + } + + if (ancestor_scope != scope) { + for (auto &var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + // Create persistable vars in ancestor scope. + if (var->Persistable()) { + auto *ptr = const_cast(ancestor_scope)->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { // Create temporary variables in local scope. + auto *ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto &var : global_block.AllVars()) { + auto *ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } +} + +void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id, + bool with_feed_fetch_ops) { + for (const auto &op_desc : desc.Block(block_id).AllOps()) { + if (!with_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s", + op_desc->Input("X")[0], op_desc->Type(), + op_desc->Output("Out")[0]); + continue; + } + ops_.emplace_back(OpRegistry::CreateOp(*op_desc)); + } +} + +LoDTensor *NaiveExecutor::FindTensor(const std::string &name) { + PADDLE_ENFORCE(scope_, "Need to init scope first"); + auto *var = scope_->FindVar(name); + PADDLE_ENFORCE(var, "No variable [%s] in the scope"); + auto *tensor = const_cast(&var->Get()); + return tensor; +} + +void NaiveExecutor::CleanFeedFetchOps() { + std::vector> ops; + for (auto &op : ops_) { + if (op->Type() != "feed" && op->Type() != "fetch") { + ops.emplace_back(std::move(op)); + } + } + ops_.swap(ops); +} + +void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(3) << "use_mkldnn=True"; + for (size_t block_id = 0; block_id < program.Size(); ++block_id) { + auto *block = const_cast(program).MutableBlock(block_id); + for (auto *op : block->AllOps()) { + if (op->HasAttr("use_mkldnn")) { + op->SetAttr("use_mkldnn", true); + } + } + } +#else + LOG(WARNING) + << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..9374f3f4a35cc0f90e5b2d6e8b397784b8eae123 --- /dev/null +++ b/paddle/fluid/framework/naive_executor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +/* + * Simple, intuitive and effective. Only single thread is supported, and + * currently designed for inference. + */ +class NaiveExecutor { + public: + explicit NaiveExecutor(const platform::Place& place) : place_(place) {} + + // Create child scope. + // Create variables. + // @with_feed_fetch_ops: whether to work with the feed and fetch operators. + void Prepare(Scope* parent_scope, const ProgramDesc& program_desc, + int block_id, bool with_feed_fetch_ops); + + // Run all the operators. + void Run(); + + // Get an tensor to operating directly, without the need for feed_ops. + LoDTensor* FindTensor(const std::string& name); + + Scope* scope() { return scope_; } + + void CleanFeedFetchOps(); + + void EnableMKLDNN(const ProgramDesc& program); + + protected: + void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); + + void CreateOps(const ProgramDesc& desc, int block_id, + bool with_feed_fetch_ops); + + private: + const platform::Place place_; + // Catch the required resource to avoid recreate. + std::vector> ops_; + Scope* scope_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b9f79b9d398bc5a0ee6ba66587924daad0dbbc5 --- /dev/null +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/naive_executor.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +TEST(NaiveExecutor, Basic) { + ProgramDesc program; + auto* main_block = program.MutableBlock(0); + auto* a = main_block->Var("a"); // input + auto* b = main_block->Var("b"); // input + auto* c = main_block->Var("c"); // input + a->SetType(proto::VarType::LOD_TENSOR); + b->SetType(proto::VarType::LOD_TENSOR); + c->SetType(proto::VarType::LOD_TENSOR); + + auto* add = main_block->AppendOp(); + add->SetType("elementwise_add"); + add->SetInput("X", {"a"}); + add->SetInput("Y", {"b"}); + add->SetOutput("Out", {"c"}); + + auto place = platform::CPUPlace(); + NaiveExecutor exe(place); + exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/); + auto* a_tensor = exe.FindTensor("a"); + auto* b_tensor = exe.FindTensor("b"); + auto* c_tensor = exe.FindTensor("c"); + + a_tensor->Resize({1, 4}); + b_tensor->Resize({1, 4}); + c_tensor->Resize({1, 4}); + b_tensor->mutable_data(place); + a_tensor->mutable_data(place); + + float a_arr[] = {0, 1, 2, 3}; + float b_arr[] = {0.0, .1, .2, .3}; + + std::copy_n(a_arr, 4, a_tensor->mutable_data(place)); + std::copy_n(b_arr, 4, b_tensor->mutable_data(place)); + + exe.Run(); + + auto* c_data = c_tensor->mutable_data(place); + for (int i = 0; i < 4; i++) { + EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3); + } +} + +} // namespace framework +} // namespace paddle + +USE_OP(elementwise_add); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f5a54c0f48c98512dcb393d63a61f3c927e7ac1f..f06bad6c78c05804e583f859906b88fb7b500372 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,21 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" - #include #include #include +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_viz_pass.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" #endif #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" -#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -35,80 +33,6 @@ limitations under the License. */ namespace paddle { namespace framework { -std::unique_ptr ApplyParallelExecutorPass( - const ProgramDesc &main_program, const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶m_names, - const std::vector &local_scopes, const bool use_cuda, -#ifdef PADDLE_WITH_CUDA - const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { -#else - const BuildStrategy &strategy) { -#endif - // Convert the program to graph. - std::unique_ptr graph(new ir::Graph(main_program)); - - // Apply a graph viz pass to record a graph. - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } - - // Apply op fusion. - if (strategy.fuse_elewise_add_act_ops_) { - auto fuse_elewise_add_act_pass = - ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass"); - graph = fuse_elewise_add_act_pass->Apply(std::move(graph)); - // Apply a graph viz pass to record a graph. - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } - } - - // Convert graph to run on multi-devices. - auto multi_devices_pass = - ir::PassRegistry::Instance().Get("multi_devices_pass"); - multi_devices_pass->SetNotOwned>("places", - &places); - multi_devices_pass->SetNotOwned("loss_var_name", - &loss_var_name); - multi_devices_pass->SetNotOwned>( - "params", ¶m_names); - multi_devices_pass->SetNotOwned>("local_scopes", - &local_scopes); - multi_devices_pass->SetNotOwned("strategy", &strategy); - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; - multi_devices_pass->SetNotOwned("nccl_ctxs", nctx); -#endif - graph = multi_devices_pass->Apply(std::move(graph)); - - // Apply a graph print pass to record a graph with device info. - if (!strategy.debug_graphviz_path_.empty()) { - auto multi_devices_print_pass = - ir::PassRegistry::Instance().Get("multi_devices_print_pass"); - multi_devices_print_pass->SetNotOwned( - "debug_graphviz_path", &strategy.debug_graphviz_path_); - multi_devices_print_pass->Set( - "graph_printer", new details::GraphvizSSAGraphPrinter); - graph = multi_devices_print_pass->Apply(std::move(graph)); - } - - // Verify that the graph is correct for multi-device executor. - auto multi_devices_check_pass = - ir::PassRegistry::Instance().Get("multi_devices_check_pass"); - graph = multi_devices_check_pass->Apply(std::move(graph)); - return graph; -} - class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) @@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor( // Step 3. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp #ifdef PADDLE_WITH_CUDA - std::unique_ptr graph = ApplyParallelExecutorPass( + std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, build_strategy, - member_->nccl_ctxs_.get()); + member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { @@ -228,11 +151,19 @@ ParallelExecutor::ParallelExecutor( } } #else - std::unique_ptr graph = ApplyParallelExecutorPass( - main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, build_strategy); + std::unique_ptr graph = + build_strategy.Apply(main_program, member_->places_, loss_var_name, + params, member_->local_scopes_, member_->use_cuda_); #endif + if (VLOG_IS_ON(5)) { + // If the loss_var_name is given, the number of graph should be only one. + if (loss_var_name.size()) { + PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, + "The number of graph should be only one"); + } + } + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); @@ -319,6 +250,13 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { ResetReferenceCount(); + for (auto &pair : cur_ref_cnts_) { + auto &name_map = *(pair.second); + for (auto &fetch_name : fetch_tensors) { + name_map.erase(fetch_name); + } + name_map.erase(fetched_var_name); + } } #endif auto fetch_data = member_->executor_->Run(fetch_tensors); @@ -373,12 +311,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle - -USE_PASS(fuse_elewise_add_act_pass); -USE_PASS(graph_viz_pass); -USE_PASS(multi_devices_pass); -USE_PASS(multi_devices_check_pass); -USE_PASS(multi_devices_print_pass); #ifdef PADDLE_WITH_CUDA USE_PASS(reference_count_pass); #endif diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index c64906ff230df5f2b7cc9f5c6b29d68956ab8f33..fd386a5987f11ff64964e95eb7e9b83572dc790c 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,14 +14,14 @@ limitations under the License. */ #pragma once -#include #include #include #include #include #include + +#include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index da163835e8652ae479121bd67f2eed77332b2740..dbf00f3a79f7d1dcf97b346fccfdb68f119d4aa3 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -46,6 +46,7 @@ struct RWLock { private: pthread_rwlock_t lock_; }; +// TODO(paddle-dev): Support RWLock for WIN32 for correctness. #else // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 5ca864cfdf7176850dd31dd42ef3306061a742cf..928e1ad8b9168e61ddc5782066a4aa29a4296a94 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test { selected_rows_.reset(new SelectedRows(rows, height)); Tensor* value = selected_rows_->mutable_value(); - value->mutable_data( + auto* data = value->mutable_data( make_ddim({static_cast(rows.size()), row_numel}), place_); + for (int64_t i = 0; i < value->numel(); ++i) { + data[i] = static_cast(i); + } } protected: @@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); + auto* dst_data = dst_tensor.value().data(); + for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) { + ASSERT_EQ(dst_data[i], static_cast(i)); + } } TEST(SelectedRows, SparseTable) { diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h index f6c6a1fec13d8b12efd1fa71a7a93316e484d045..508ee931c6ed7f66e09abd8f0e4b33c3d3c135fd 100644 --- a/paddle/fluid/framework/tuple.h +++ b/paddle/fluid/framework/tuple.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 1aa0ae0f7c1946d91736ab61236a65a45c203fe3..7e3f002b53351ba5892aaa50482b21a83db94069 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -88,13 +88,7 @@ std::vector> VarDesc::GetShapes() const { } void VarDesc::SetDataType(proto::VarType::Type data_type) { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - mutable_channel_desc()->set_data_type(data_type); - break; - default: - mutable_tensor_desc()->set_data_type(data_type); - } + mutable_tensor_desc()->set_data_type(data_type); } void VarDesc::SetDataTypes( @@ -115,13 +109,7 @@ void VarDesc::SetDataTypes( } proto::VarType::Type VarDesc::GetDataType() const { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return channel_desc().data_type(); - break; - default: - return tensor_desc().data_type(); - } + return tensor_desc().data_type(); } std::vector VarDesc::GetDataTypes() const { @@ -134,17 +122,6 @@ std::vector VarDesc::GetDataTypes() const { return res; } -void VarDesc::SetCapacity(int64_t capacity) { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - desc_.mutable_type()->mutable_channel()->set_capacity(capacity); - break; - default: - PADDLE_THROW("Setting 'capacity' is not supported by the type of var %s.", - this->Name()); - } -} - void VarDesc::SetLoDLevel(int32_t lod_level) { switch (desc_.type().type()) { case proto::VarType::LOD_TENSOR: @@ -214,19 +191,6 @@ std::vector VarDesc::GetLoDLevels() const { } } -const proto::VarType::ChannelDesc &VarDesc::channel_desc() const { - PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return desc_.type().channel(); - default: - PADDLE_THROW( - "Getting 'channel_desc' is not supported by the type of var %s.", - this->Name()); - } -} - const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); @@ -262,20 +226,6 @@ std::vector VarDesc::tensor_descs() const { } } -proto::VarType::ChannelDesc *VarDesc::mutable_channel_desc() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return desc_.mutable_type()->mutable_channel(); - default: - PADDLE_THROW( - "Getting 'mutable_channel_desc' is not supported by the type of var " - "%s.", - this->Name()); - } -} - proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index 9f7a21ef42b8d3e74b6e211d6254294ba1fa2341..e33849ef502fb10b913e7e28cbd0abdb8b8ff9bb 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -87,8 +87,6 @@ class VarDesc { void SetDataTypes( const std::vector &multiple_data_type); - void SetCapacity(int64_t capacity); - proto::VarType::Type GetDataType() const; std::vector GetDataTypes() const; @@ -110,10 +108,8 @@ class VarDesc { void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } private: - const proto::VarType::ChannelDesc &channel_desc() const; const proto::VarType::TensorDesc &tensor_desc() const; std::vector tensor_descs() const; - proto::VarType::ChannelDesc *mutable_channel_desc(); proto::VarType::TensorDesc *mutable_tensor_desc(); std::vector mutable_tensor_descs(); diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index e9550dbfb976bee70741158b94b04084919e8271..3b6f1cdb8f24ab20bfc80eeeba88891d7b41d1f9 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) { return proto::VarType_Type_SELECTED_ROWS; } else if (IsType(type)) { return proto::VarType_Type_READER; - } else if (IsType(type)) { - return proto::VarType_Type_CHANNEL; } else { PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); } @@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { case proto::VarType_Type_READER: visitor(var.Get()); return; - case proto::VarType_Type_CHANNEL: - visitor(var.Get()); - return; default: PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6698efd1fa773127a84b4bcb28f57f4226dd7ae2..ec1bc7825dd21628f5c37ea44a154abe7b7e8c73 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -20,7 +20,8 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) add_subdirectory(api) # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor) +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api + analysis_predictor zero_copy_tensor) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") @@ -31,6 +32,7 @@ endif() cc_library(paddle_fluid_shared SHARED SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) @@ -53,7 +55,7 @@ if(NOT APPLE) endif() if(WITH_TESTING) - # tests/book depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index c2a1c6634bd8f8de0796456e91cb3c530d4c6823..d4d2fd4634f9e11f3f002e11e177c332ced49885 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc @@ -20,8 +20,6 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) -set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) - function (inference_analysis_test TARGET) if(WITH_TESTING) set(options "") diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index b6edb5529ace2ad5bd1b35bfbee1f7a744457cc3..13805ea4acf936b242bcd86b2faf89813753a9fe 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -41,12 +41,6 @@ class AnalysisPass { // all passes have run. virtual bool Finalize() { return false; } - // Get a Pass appropriate to print the Node this pass operates on. - virtual AnalysisPass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const { - return nullptr; - } - // Create a debugger Pass that draw the DFG by graphviz toolkit. virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 9bdbefc07cbc4bf7a4714927c84855837610430e..0aa9367bf5692e53e2a1f1247523cf9a4f0b3a1d 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -64,14 +64,15 @@ class Analyzer : public OrderedRegistry { // larger fusion. const std::vector all_ir_passes_{{ // Manual update the passes here. - "infer_clean_graph_pass", // - "attention_lstm_fuse_pass", // - "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "fc_fuse_pass", // + "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "embedding_fc_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN "conv_relu_mkldnn_fuse_pass", // #endif diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 3b5be7f3ee33c73a9704bafa9f1b736c8a3cd9ea..f90910ac0d0a897ef01d4ca2bd0bca575baf4c40 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) { FLAGS_IA_enable_tensorrt_subgraph_engine = true; Argument argument; + argument.Set("minimum_subgraph_size", new int(0)); + argument.Set("max_batch_size", new int(3)); + argument.Set("workspace_size", new int(1 << 20)); + argument.Set("precision_mode", new std::string("FP32")); argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } -void TestWord2vecPrediction(const std::string &model_path) { +void TestWord2vecPrediction(const std::string& model_path) { NativeConfig config; config.model_dir = model_path; config.use_gpu = false; @@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) { // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; - PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], result[i]); } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4..cb549f4b50cf56154a951d16b58b022dbad3e990 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { } } -void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, +void CreateTrtEngineOp(Node *node, Argument *argument, framework::proto::BlockDesc *block) { + PADDLE_ENFORCE(argument->main_dfg.get()); + const DataFlowGraph &graph = *(argument->main_dfg); static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; @@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); // Set attrs + SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); + SetAttr(desc.Proto(), "max_batch_size", argument->Get("max_batch_size")); + SetAttr(desc.Proto(), "workspace_size", argument->Get("workspace_size")); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "output_name_mapping", output_mapping); @@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { *block_desc.Proto()->mutable_vars() = argument_->origin_program_desc->blocks(0).vars(); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); - CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto()); + CreateTrtEngineOp(node, argument_, block_desc.Proto()); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index b879067d2f2f6294c50e0adb21f9399a7c36698a..526bbbadfe90c3064d7c620cc22e30f7fef99088 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { + if (subgraph.size() <= argument_->Get("minimum_subgraph_size")) + continue; std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. Mark the nodes inside the sub-graph diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h index a31afbe6933da8d3c7a88142cc12d63b98b55796..76e4fda0249e03c617d1b37c079dcd97f21387c1 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.h +++ b/paddle/fluid/inference/analysis/subgraph_splitter.h @@ -20,6 +20,7 @@ limitations under the License. */ #include +#include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/node.h" @@ -63,8 +64,11 @@ class SubGraphFuse { public: using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; - SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) - : graph_(graph), node_inside_subgraph_teller_(teller) {} + SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller, + Argument *argument) + : graph_(graph), + node_inside_subgraph_teller_(teller), + argument_(argument) {} // The main method which run all the logic. void operator()(); @@ -76,6 +80,7 @@ class SubGraphFuse { private: DataFlowGraph *graph_; NodeInsideSubgraphTeller node_inside_subgraph_teller_; + Argument *argument_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 531a170512f727d891aa6644ee08a60c25f16876..e1dc89fab5fb76d456b07c316ab1cabe6de23b26 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) { TEST(SubGraphSplitter, Fuse) { auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); + Argument argument; + argument.Set("minimum_subgraph_size", new int(3)); size_t count0 = dfg.nodes.size(); - SubGraphFuse fuse(&dfg, teller); + SubGraphFuse fuse(&dfg, teller, &argument); fuse(); int count1 = 0; diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index faf876de6d65d20cf7a084cd97392cfc8d791a42..cc1746ecb34c983d219693bcec17c8789c38fa9f 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( : node_inside_subgraph_teller_(teller) {} void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { - SubGraphFuse(graph, node_inside_subgraph_teller_)(); + SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)(); VLOG(4) << "debug info " << graph->HumanReadableInfo(false /*show_values*/, true /*show_functions*/); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index 219e3f5470f627e81005aabf94f9c72c33fd2eed..3545da9109d79964f36c3d7e738620cc2e0f9a6c 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); - bool Initialize(Argument* argument) override { return true; } + bool Initialize(Argument* argument) override { + argument_ = argument; + return true; + } // This class get a sub-graph as input and determine whether to transform this // sub-graph into TensorRT. @@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { private: NodeInsideSubgraphTeller node_inside_subgraph_teller_; + Argument* argument_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 67a5af83d89b771536ea11be51b35244ff5c09d6..9748e24b06295a4e7c2995429e6588cd0f225fe6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) { }; Argument argument(FLAGS_inference_model_dir); + argument.Set("minimum_subgraph_size", new int(0)); + argument.Set("max_batch_size", new int(3)); + argument.Set("workspace_size", new int(1 << 20)); + argument.Set("precision_mode", new std::string("FP32")); DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e569df94c54c304852dab7c7496804c1b08d665c..0ddd5d53f836131fe37d412fc867cb38f11ee2b5 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,10 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) endif() function(inference_api_test TARGET_NAME) @@ -31,7 +31,6 @@ function(inference_api_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) cc_test(${TARGET_NAME} SRCS ${inference_test_SRC} DEPS "${inference_deps}" @@ -43,8 +42,10 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) +cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) +cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -52,18 +53,22 @@ cc_test(test_paddle_inference_api inference_api_test(test_api_impl SRC api_impl_tester.cc ARGS test_word2vec test_image_classification) +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book) + if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) endif() if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 00cbe28d45dc4393ba1c141912aee7d1b7469a89..3bc6af5241c41bd805699121d614d431d46d863f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -16,9 +16,12 @@ #include #include #include +#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -28,8 +31,11 @@ DECLARE_bool(profile); namespace paddle { +using contrib::AnalysisConfig; + bool AnalysisPredictor::Init( - const std::shared_ptr& parent_scope) { + const std::shared_ptr &parent_scope, + const std::shared_ptr &program) { VLOG(3) << "Predictor::init()"; #if !defined(_WIN32) if (FLAGS_profile) { @@ -43,7 +49,8 @@ bool AnalysisPredictor::Init( if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); - LOG(WARNING) << "ir optimize only supports CPU currently"; + LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " + "is turned false."; config_.enable_ir_optim = false; } else { place_ = paddle::platform::CPUPlace(); @@ -56,40 +63,143 @@ bool AnalysisPredictor::Init( scope_.reset(new paddle::framework::Scope()); } - executor_.reset(new paddle::framework::Executor(place_)); + executor_.reset(new paddle::framework::NaiveExecutor(place_)); - // Initialize the inference program - if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), - config_.model_dir); - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. - inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + if (!program) { + if (!LoadProgramDesc()) return false; + OptimizeInferenceProgram(); } else { - LOG(ERROR) << "fail to load inference model from " << config_.model_dir; - return false; + inference_program_ = program; } - OptimizeInferenceProgram(); - ctx_ = executor_->Prepare(*inference_program_, 0); if (config_._use_mkldnn) { executor_->EnableMKLDNN(*inference_program_); } - VLOG(5) << "to create variables"; - PADDLE_ENFORCE(scope_.get()); - executor_->CreateVariables(*inference_program_, - sub_scope_ ? sub_scope_ : scope_.get(), 0); + executor_->Prepare(scope_.get(), *inference_program_, 0, + config_.use_feed_fetch_ops); + // Get the feed_target_names and fetch_target_names PrepareFeedFetch(); return true; } +bool AnalysisPredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + VLOG(3) << "Predictor::predict"; + inference::Timer timer; + timer.tic(); + // set feed variable + std::vector feeds; + framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; + return false; + } + + // Run the inference program + // if share variables, we need not create variables + executor_->Run(); + + // get fetch variable + if (!GetFetch(output_data, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; + } + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + return true; +} + +bool AnalysisPredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + if (inputs.size() != feeds_.size()) { + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); + return false; + } + + // Cache the inputs memory for better concurrency performance. + feed_tensors_.resize(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto &input = feed_tensors_[i]; + framework::DDim ddim = framework::make_ddim(inputs[i].shape); + void *input_ptr; + if (inputs[i].dtype == PaddleDType::INT64) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else if (inputs[i].dtype == PaddleDType::FLOAT32) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else { + LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; + return false; + } + + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + int idx = -1; + if (config_.specify_input_name) { + idx = feed_names_[inputs[i].name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); + } + return true; +} + +template +void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); + } +} + +bool AnalysisPredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetchs_.size()); + for (size_t i = 0; i < fetchs_.size(); ++i) { + int idx = boost::get(fetchs_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + if (type == typeid(float)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == typeid(int64_t)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + } + return true; +} + void AnalysisPredictor::OptimizeInferenceProgram() { LOG(INFO) << "optimize begin"; FLAGS_IA_enable_ir = config_.enable_ir_optim; @@ -107,6 +217,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { new std::string(config_.prog_file)); argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); } + argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); PADDLE_ENFORCE( @@ -127,9 +238,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr -CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { +std::unique_ptr CreatePaddlePredictor< + AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy @@ -150,15 +260,90 @@ CreatePaddlePredictor( } std::unique_ptr predictor(new AnalysisPredictor(config)); - if (!dynamic_cast(predictor.get())->Init(nullptr)) { + if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return predictor; } +void AnalysisPredictor::PrepareFeedFetch() { + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= static_cast(idx)) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetchs_.size() <= static_cast(idx)) { + fetchs_.resize(idx + 1); + } + fetchs_[idx] = op; + } + } +} + +std::unique_ptr AnalysisPredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = true; + res->SetName(name); + return res; +} + +std::unique_ptr AnalysisPredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = false; + res->SetName(name); + return res; +} + +bool AnalysisPredictor::ZeroCopyRun() { + executor_->Run(); + return true; +} + +bool AnalysisPredictor::LoadProgramDesc() { + // Initialize the inference program + std::unique_ptr tmp_exe( + new framework::Executor(platform::CPUPlace())); + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s'.", config_.model_dir, + config_.param_file); + return false; + } + return true; +} +std::unique_ptr AnalysisPredictor::Clone() { + auto *x = new AnalysisPredictor(config_); + x->Init(scope_, inference_program_); + return std::unique_ptr(x); +} + template <> std::unique_ptr CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { + const contrib::AnalysisConfig &config) { return CreatePaddlePredictor(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index aa00e8be5c28c2e3bfe74fa0bff2c72210bd106e..0d01d7ac2b29ea6364b07af9bb3bdeb5ced6bd00 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -12,42 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" namespace paddle { using inference::analysis::Argument; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; +using framework::NaiveExecutor; +using contrib::AnalysisConfig; /* This predictor is based on the original native predictor with IR and Analysis * support. It will optimize IR and Parameters in the runtime. * TODO(Superjomn) Replace the Navive predictor? */ -class AnalysisPredictor : public NativePaddlePredictor { +class AnalysisPredictor : public PaddlePredictor { public: - explicit AnalysisPredictor(const contrib::AnalysisConfig& config) - : NativePaddlePredictor(config), config_(config) {} + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} - bool Init(const std::shared_ptr& parent_scope); + bool Init(const std::shared_ptr &parent_scope, + const std::shared_ptr &program = nullptr); - bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) override { - return NativePaddlePredictor::Run(inputs, output_data, batch_size); - } + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + std::unique_ptr GetInputTensor( + const std::string &name) override; + std::unique_ptr GetOutputTensor( + const std::string &name) override; + + bool ZeroCopyRun() override; + + void PrepareFeedFetch(); void OptimizeInferenceProgram(); - Argument& analysis_argument() { return argument_; } + Argument &analysis_argument() { return argument_; } + + std::unique_ptr Clone() override; + + framework::Scope *scope() { return executor_->scope(); } + framework::ProgramDesc &program() { return *inference_program_; } + + protected: + bool LoadProgramDesc(); + + bool SetFeed(const std::vector &input_datas, + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const framework::LoDTensor &fetchs, + PaddleTensor *output_data); private: contrib::AnalysisConfig config_; Argument argument_; + std::unique_ptr executor_; + platform::Place place_; + std::shared_ptr scope_; + framework::Scope *sub_scope_{nullptr}; + std::shared_ptr inference_program_; + std::vector feeds_; + std::map feed_names_; + std::vector fetchs_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious + // concurrency problems, so cache them. + std::vector feed_tensors_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d25f55b3188a684fe38df1417d114348cfa2e8a --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(dirname, "", "dirname to tests."); + +namespace paddle { +namespace inference { +using contrib::AnalysisConfig; + +TEST(AnalysisPredictor, ZeroCopy) { + AnalysisConfig config; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; + config.use_feed_fetch_ops = false; + + auto predictor = + CreatePaddlePredictor( + config); + + auto w0 = predictor->GetInputTensor("firstw"); + auto w1 = predictor->GetInputTensor("secondw"); + auto w2 = predictor->GetInputTensor("thirdw"); + auto w3 = predictor->GetInputTensor("forthw"); + + w0->Reshape({4, 1}); + w1->Reshape({4, 1}); + w2->Reshape({4, 1}); + w3->Reshape({4, 1}); + + auto* w0_data = w0->mutable_data(PaddlePlace::kCPU); + auto* w1_data = w1->mutable_data(PaddlePlace::kCPU); + auto* w2_data = w2->mutable_data(PaddlePlace::kCPU); + auto* w3_data = w3->mutable_data(PaddlePlace::kCPU); + + for (int i = 0; i < 4; i++) { + w0_data[i] = i; + w1_data[i] = i; + w2_data[i] = i; + w3_data[i] = i; + } + + predictor->ZeroCopyRun(); + + auto out = predictor->GetOutputTensor("fc_1.tmp_2"); + PaddlePlace place; + int size = 0; + auto* out_data = out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + LOG(INFO) << "output_data: " << out_data; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index c71769a32f604358fe68c927546591310649f116..01ea942d3c8d20180cfc9664b8601ba87a898e86 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -1,16 +1,22 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle_inference_api.h" namespace paddle { @@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) { } } -PaddleBuf::PaddleBuf(PaddleBuf&& other) +PaddleBuf::PaddleBuf(PaddleBuf &&other) : data_(other.data_), length_(other.length_), memory_owned_(other.memory_owned_) { @@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) other.length_ = 0; } -PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } +PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; } -PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { +PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { if (!other.memory_owned_) { data_ = other.data_; length_ = other.length_; @@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { return *this; } -PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { +PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) { // only the buffer with external memory can be copied data_ = other.data_; length_ = other.length_; @@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) { } } -void PaddleBuf::Reset(void* data, size_t length) { +void PaddleBuf::Reset(void *data, size_t length) { Free(); memory_owned_ = false; data_ = data; @@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { PADDLE_ENFORCE_GT(length_, 0); - free(static_cast(data_)); + free(static_cast(data_)); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c57fc64bb6bfeebc7935f19d0e977e8fccd4c9a0..6682e0a81b20c82aa668a249d37986386d769c83 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" -#include "paddle/fluid/inference/api/timer.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); @@ -144,7 +144,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, VLOG(4) << "Run prepared context"; executor_->RunPreparedContext(ctx_.get(), scope, false, /* don't create local scope each time*/ - false /* don't create variable eatch time */); + false /* don't create variable each time */); VLOG(4) << "Finish prepared context"; // get fetch variable if (!GetFetch(output_data, scope)) { @@ -215,57 +215,20 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, template void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, PaddleTensor *output) { - std::vector shape; - auto dims_i = fetch.dims(); - auto lod = fetch.lod(); - const T *output_ptr = fetch.data(); - auto num = fetch.numel(); - std::vector data; - if (0 == lod.size()) { - std::copy(output_ptr, output_ptr + num, std::back_inserter(data)); - for (int j = 0; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } else { - // for batch detection - // image[0] -> output[0] shape {145, 6} - // image[1] -> output[1] shape {176, 6} - // then, - // the batch output shape {321, 6} - // the lod {{0, 145, 321}} - // so we should append output[0] to {176, 6} - size_t max_dim = 0; - for (size_t j = 1; j < lod[0].size(); j++) { - max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]); - } - size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back(); - if (max_dim > 0) { - data.resize((lod[0].size() - 1) * max_dim * common_dim, 0); - } - for (size_t j = 1; j < lod[0].size(); j++) { - size_t start = lod[0][j - 1] * common_dim; - size_t end = lod[0][j] * common_dim; - if (end > start) { - std::copy(output_ptr + start, output_ptr + end, - data.begin() + (j - 1) * max_dim * common_dim); - } - } - shape.push_back(lod[0].size() - 1); - shape.push_back(max_dim); - for (int j = 1; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } - - output->shape = shape; - auto &buffer = output->data; - if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { - buffer.Resize(sizeof(T) * data.size()); - } - std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size()); - // copy LoD - for (const auto &level : fetch.lod()) { - output->lod.emplace_back(level); + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); } } diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 6386d601262b3dac0e957fae991d23768b52f2c0..7882f6a53c7ce9a2486158ea9b50c018d1814091 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -30,6 +30,8 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler.h" @@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor { ~NativePaddlePredictor() override; + framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); } + protected: bool SetFeed(const std::vector &input_datas, framework::Scope *scope); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index fc1364b80ac1ee2d304eb2fe429eae5f56967516..bed7c871311e476b4ed113d982c690383ad732de 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -21,6 +21,12 @@ limitations under the License. */ #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" +#ifdef __clang__ +#define ACC_DIFF 4e-3 +#else +#define ACC_DIFF 1e-3 +#endif + DEFINE_string(dirname, "", "Directory of the inference model."); namespace paddle { @@ -43,7 +49,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { NativeConfig GetConfig() { NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; #ifdef PADDLE_WITH_CUDA @@ -99,8 +105,8 @@ void MainWord2Vec(bool use_gpu) { float* lod_data = output1.data(); for (int i = 0; i < output1.numel(); ++i) { - EXPECT_LT(lod_data[i] - data[i], 1e-3); - EXPECT_GT(lod_data[i] - data[i], -1e-3); + EXPECT_LT(lod_data[i] - data[i], ACC_DIFF); + EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF); } } @@ -110,7 +116,7 @@ void MainImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; const bool is_combined = false; std::vector> feed_target_shapes = @@ -144,7 +150,7 @@ void MainImageClassification(bool use_gpu) { float* data = static_cast(outputs[0].data.data()); float* lod_data = output1.data(); for (size_t j = 0; j < len / sizeof(float); ++j) { - EXPECT_NEAR(lod_data[j], data[j], 1e-3); + EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF); } } @@ -199,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) { float* ref_data = refs[tid].data(); EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); for (int i = 0; i < refs[tid].numel(); ++i) { - EXPECT_NEAR(ref_data[i], data[i], 1e-3); + EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); } }); } @@ -214,7 +220,7 @@ void MainThreadsImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; auto main_predictor = CreatePaddlePredictor(config); std::vector jobs(num_jobs); @@ -251,7 +257,7 @@ void MainThreadsImageClassification(bool use_gpu) { float* ref_data = refs[tid].data(); EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float)); for (int i = 0; i < refs[tid].numel(); ++i) { - EXPECT_NEAR(ref_data[i], data[i], 1e-3); + EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); } }); } diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 6c7e63971b2d93f58e219dbd93637c8d389deb7c..5ee6a5a93168f58770067f76ca7f6bb6f67b2965 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { bool Init(const std::shared_ptr& parent_scope) { FLAGS_IA_enable_tensorrt_subgraph_engine = true; VLOG(3) << "Predictor::init()"; - FLAGS_tensorrt_max_batch_size = config_.max_batch_size; - FLAGS_tensorrt_workspace_size = config_.workspace_size; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + + argument.Set("minimum_subgraph_size", + new int(config_.minimum_subgraph_size)); + argument.Set("max_batch_size", new int(config_.max_batch_size)); + argument.Set("workspace_size", new int(config_.workspace_size)); + argument.Set("precision_mode", + new std::string(config_.precision_mode)); + if (!config_.model_dir.empty()) { argument.fluid_model_dir.reset(new std::string(config_.model_dir)); } else { diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 0f7d541c5edfc62e80cf50f83b491f06dcb42644..44335a872f2e00b34e29a9e7601cb390a460362c 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -2,6 +2,9 @@ set -x PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode +DATA_DIR=$4 # dataset +cd `dirname $0` +current_dir=`pwd` if [ $2 == ON ]; then # You can export yourself if move the install path MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib @@ -29,15 +32,15 @@ function download() { fi cd .. } -mkdir -p data -cd data +mkdir -p $DATA_DIR +cd $DATA_DIR vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name done -cd .. # compile and test the demo +cd $current_dir mkdir -p build cd build @@ -73,9 +76,9 @@ for WITH_STATIC_LIB in ON OFF; do for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do ./vis_demo \ - --modeldir=../data/$vis_demo_name/model \ - --data=../data/$vis_demo_name/data.txt \ - --refer=../data/$vis_demo_name/result.txt \ + --modeldir=$DATA_DIR/$vis_demo_name/model \ + --data=$DATA_DIR/$vis_demo_name/data.txt \ + --refer=$DATA_DIR/$vis_demo_name/result.txt \ --use_gpu=$use_gpu if [ $? -ne 0 ]; then echo "vis demo $vis_demo_name runs fail." diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..14698f6dfc8885ec1d35f1912bad10a9caa13db4 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(input_or_output_, + "Can't reshape the output tensor, it is readonly"); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(shape)); +} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + auto *tensor = static_cast(FindTensor()); + switch (static_cast(place)) { + case static_cast(PaddlePlace::kCPU): { + return tensor->mutable_data(platform::CPUPlace()); + } + case static_cast(PaddlePlace::kGPU): { + return tensor->mutable_data(platform::CUDAPlace()); + } + default: + PADDLE_THROW("Unsupported place: %d", static_cast(place)); + break; + } + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { + auto *tensor = static_cast(FindTensor()); + auto *res = tensor->data(); + + if (platform::is_cpu_place(tensor->place())) { + *place = PaddlePlace::kCPU; + } else if (platform::is_gpu_place(tensor->place())) { + *place = PaddlePlace::kGPU; + } else { + *place = PaddlePlace::kUNK; + } + + *size = tensor->numel(); + return res; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + return tensor; +} + +std::vector ZeroCopyTensor::shape() { + auto *tensor = static_cast(FindTensor()); + PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_); + return framework::vectorize(tensor->dims()); +} + +void ZeroCopyTensor::SetLoD(const std::vector> &x) { + auto *tensor = static_cast(FindTensor()); + framework::LoD lod; + for (auto &level : x) { + lod.emplace_back(level); + } + tensor->set_lod(lod); +} + +std::vector> ZeroCopyTensor::lod() const { + std::vector> res; + auto *tensor = static_cast(FindTensor()); + for (auto &level : tensor->lod()) { + res.emplace_back(level); + } + return res; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d5b561d801cd9e734cab13b28e7285493e30f94 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) {} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { + return nullptr; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { return nullptr; } + +std::vector ZeroCopyTensor::shape() { return {}; } + +void ZeroCopyTensor::SetLoD(const std::vector> &x) {} + +std::vector> ZeroCopyTensor::lod() const { + return std::vector>(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 8e359a67738c0df180933421b45f15b39fd0e78c..24f59cf43a9700ff1732e1ef6ad82e1a6294eede 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -16,17 +16,34 @@ #include #include -#include +#include // NOLINT #include #include #include #include -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/timer.h" +#include "paddle/fluid/string/printf.h" +#include "paddle_inference_api.h" namespace paddle { namespace inference { +// Timer for timer +class Timer { + public: + std::chrono::high_resolution_clock::time_point start; + std::chrono::high_resolution_clock::time_point startu; + + void tic() { start = std::chrono::high_resolution_clock::now(); } + double toc() { + startu = std::chrono::high_resolution_clock::now(); + std::chrono::duration time_span = + std::chrono::duration_cast>(startu - + start); + double used_time_ms = static_cast(time_span.count()) * 1000.0; + return used_time_ms; + } +}; + static void split(const std::string &str, char sep, std::vector *pieces) { pieces->clear(); @@ -74,13 +91,17 @@ template <> std::string to_string>>( const std::vector>> &vec); +template +int VecReduceToInt(const std::vector &v) { + return std::accumulate(v.begin(), v.end(), 1, [](T a, T b) { return a * b; }); +} + template static void TensorAssignData(PaddleTensor *tensor, const std::vector> &data) { // Assign buffer - int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, - [](int a, int b) { return a * b; }); - tensor->data.Resize(sizeof(T) * dim); + int num_elems = VecReduceToInt(tensor->shape); + tensor->data.Resize(sizeof(T) * num_elems); int c = 0; for (const auto &f : data) { for (T v : f) { @@ -89,7 +110,21 @@ static void TensorAssignData(PaddleTensor *tensor, } } -std::string DescribeTensor(const PaddleTensor &tensor) { +template +static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { + int size{0}; + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + int c = 0; + for (const auto &f : data) { + for (T v : f) { + ptr[c++] = v; + } + } + return size; +} + +static std::string DescribeTensor(const PaddleTensor &tensor) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; os << " - type: "; @@ -113,8 +148,7 @@ std::string DescribeTensor(const PaddleTensor &tensor) { os << "\n"; os << " - data: "; - int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, - [](int a, int b) { return a * b; }); + int dim = VecReduceToInt(tensor.shape); for (int i = 0; i < dim; i++) { os << static_cast(tensor.data.data())[i] << " "; } @@ -122,8 +156,8 @@ std::string DescribeTensor(const PaddleTensor &tensor) { return os.str(); } -void PrintTime(int batch_size, int repeat, int num_threads, int tid, - double latency, int epoch = 1) { +static void PrintTime(int batch_size, int repeat, int num_threads, int tid, + double latency, int epoch = 1) { LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid << ", latency: " << latency << "ms ======"; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2b4e5ed73704041e18bdbce32338405f3601e082..d2876dc27c8826c2f27be21fa0b9fef92d03067a 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -101,6 +101,40 @@ struct PaddleTensor { std::vector> lod; // Tensor+LoD equals LoDTensor }; +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; +// Tensor without copy, currently only supports AnalysisPredictor. +class ZeroCopyTensor { + public: + void Reshape(const std::vector& shape); + + // Get the memory in CPU or GPU with specific data type, should Reshape first + // to tell the data size. + // Once can directly call this data to feed the data. + // This is for write the input tensor. + template + T* mutable_data(PaddlePlace place); + // Get the memory directly, will return the place and memory size by pointer. + // This is for reading the output tensor. + template + T* data(PaddlePlace* place, int* size); + + std::vector shape(); + + void SetLoD(const std::vector>& x); + std::vector> lod() const; + + protected: + ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; +}; + /* * A simple Inference API for Paddle. */ @@ -120,6 +154,19 @@ class PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + // Zero copy input and output optimization. + // Get the input or output tensors, and operate on their memory directly, + // without copy. + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + virtual bool ZeroCopyRun() { return false; } + // Clone a predictor that share the model weights, the Cloned predictor should // be thread-safe. virtual std::unique_ptr Clone() = 0; @@ -194,6 +241,14 @@ struct MixedRTConfig : public NativeConfig { // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int workspace_size{1 << 30}; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int minimum_subgraph_size = 3; + // Reserved configuration + // We just support "FP32" now, "FP16" and "INT8" will be supported. + std::string precision_mode = "FP32"; }; // NOTE WIP, not stable yet. @@ -204,12 +259,17 @@ struct AnalysisConfig : public NativeConfig { kExclude // Specify the disabled passes in `ir_passes`. }; + // Determine whether to perform graph optimization. bool enable_ir_optim = true; + // Manually determine the IR passes to run. IrPassMode ir_mode{IrPassMode::kExclude}; - // attention lstm fuse works only on some specific models, disable as default. - std::vector ir_passes{"attention_lstm_fuse_pass"}; + std::vector ir_passes{"embedding_fc_lstm_fuse_pass"}; + + // NOT stable yet. + bool use_feed_fetch_ops{true}; // NOTE this is just for internal development, please not use it. + // NOT stable yet. bool _use_mkldnn{false}; }; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 508ef1ce40aa0882a0f39a85f97511fd9ea2a8a5..c3dd1f433691e1c96e9f38ef7b595befad26408f 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) +# seq_conv1 +set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") +download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) @@ -65,6 +70,14 @@ if (NOT EXISTS ${OCR_INSTALL_DIR}) endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +# resnet50 +set(RESNET50_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") +if (NOT EXISTS ${RESNET50_INSTALL_DIR}) + inference_download_and_uncompress(${RESNET50_INSTALL_DIR} ${INFERENCE_URL} "resnet50_model.tar.gz") +endif() +inference_analysis_test(test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_INSTALL_DIR}/model) + # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # anakin rnn1 @@ -85,3 +98,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI DEPS inference_anakin_api_shared dynload_cuda SERIAL) endif() endif() + +if(WITH_GPU AND TENSORRT_FOUND) + set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") + endif() + cc_test(test_trt_models SRCS trt_models_tester.cc + ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models + DEPS paddle_inference_tensorrt_subgraph_engine) +endif() diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index 82bc83988de688e46613e160b66943c89c4a0391..c4022225fd4526998af8526d0afb87e7a5be6336 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -22,7 +22,6 @@ limitations under the License. */ #include #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/timer.h" #include "utils/logger/logger.h" DEFINE_string(model, "", "Directory of the inference model."); diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 36bbec473114cfd2e68c97a53264957477ade3fb..5fb551810fd4d1c56547a8aa581cb6c4587df031 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -18,6 +18,8 @@ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; + struct DataRecord { std::vector data; std::vector lod; @@ -78,6 +80,7 @@ struct DataRecord { } } } + DataRecord NextBatch() { DataRecord data; data.data = batched_datas[batch_iter]; @@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 59020545cd609961487cafc4a08c20951a02c8ce..577b97e271aacab5d6740de7c8bc00bc87ae54dd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -16,6 +16,7 @@ namespace paddle { namespace inference { +using contrib::AnalysisConfig; struct DataRecord { std::vector> word_data_all, mention_data_all; @@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..8add7a59da613564d657a6fce329a089bab7e799 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->param_file = FLAGS_infer_model + "/params"; + cfg->prog_file = FLAGS_infer_model + "/model"; + cfg->use_gpu = false; + cfg->device = 0; + cfg->enable_ir_optim = true; + cfg->specify_input_name = true; +#ifdef PADDLE_WITH_MKLDNN + cfg->_use_mkldnn = true; +#endif +} + +void SetInput(std::vector> *inputs) { + PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); + + PaddleTensor input; + // channel=3, height/width=318 + std::vector shape({FLAGS_batch_size, 3, 318, 318}); + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + + // fill input data, for profile easily, do not use random data here. + size_t size = FLAGS_batch_size * 3 * 318 * 318; + input.data.Resize(size * sizeof(float)); + float *input_data = static_cast(input.data.data()); + for (size_t i = 0; i < size; i++) { + *(input_data + i) = static_cast(i) / size; + } + + std::vector input_slots; + input_slots.assign({input}); + (*inputs).emplace_back(input_slots); +} + +// Easy for profiling independently. +TEST(Analyzer_resnet50, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); + size_t size = GetSize(outputs[0]); + // output is a 512-dimension feature + EXPECT_EQ(size, 512 * FLAGS_batch_size); + } +} + +// Check the fuse status +TEST(Analyzer_resnet50, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_resnet50, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis(cfg, input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 3bf5383d8f35347c767d6caee83e0dcc5fb0a446..c76d72ccd99649913aefcb2aa57fe6061db8ca6d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -14,10 +14,13 @@ #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_bool(with_precision_check, true, "turn on test"); + namespace paddle { namespace inference { using namespace framework; // NOLINT +using namespace contrib; // NOLINT struct DataRecord { std::vector>> link_step_data_all; @@ -29,10 +32,12 @@ struct DataRecord { size_t batch_iter{0}; size_t batch_size{1}; DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { Load(path); } + DataRecord NextBatch() { DataRecord data; size_t batch_end = batch_iter + batch_size; @@ -101,6 +106,7 @@ struct DataRecord { num_samples = num_lines; } }; + void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, @@ -149,7 +155,55 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, + ZeroCopyTensor *cell_init_tensor, + ZeroCopyTensor *data_tensor, + ZeroCopyTensor *hidden_init_tensor, + ZeroCopyTensor *week_tensor, + ZeroCopyTensor *minute_tensor, + DataRecord *data_record, int batch_size) { + auto one_batch = data_record->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor->Reshape({1, 2}); + lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2}); + + cell_init_tensor->Reshape({batch_size, 15}); + cell_init_tensor->SetLoD({one_batch.lod3}); + + hidden_init_tensor->Reshape({batch_size, 15}); + hidden_init_tensor->SetLoD({one_batch.lod3}); + + data_tensor->Reshape(rnn_link_data_shape); + data_tensor->SetLoD({one_batch.lod1}); + + week_tensor->Reshape( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor->SetLoD({one_batch.lod3}); + + minute_tensor->Reshape( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor->SetLoD({one_batch.lod3}); + + // assign data + float arr0[] = {0, 0}; + std::vector zeros(batch_size * 15, 0); + std::copy_n(arr0, 2, + lod_attention_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(arr0, 2, data_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + cell_init_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + hidden_init_tensor->mutable_data(PaddlePlace::kCPU)); + ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data); + ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas); + ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas); +} + +void SetConfig(AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -187,7 +241,9 @@ TEST(Analyzer_rnn1, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM @@ -214,7 +270,230 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */); + TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */); +} + +bool CompareTensors(const framework::Scope &a_scope, + const framework::Scope &b_scope, + const std::vector &tensors) { + for (auto &x : tensors) { + auto *a_var = a_scope.FindVar(x); + auto *b_var = b_scope.FindVar(x); + if (a_var && b_var) { + if (a_var->Type() == typeid(framework::LoDTensor) || + a_var->Type() == typeid(framework::Tensor)) { + LOG(INFO) << "comparing tensor " << x; + auto &a_t = a_var->Get(); + auto &b_t = b_var->Get(); + if (!inference::CompareTensor(a_t, b_t)) { + LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x); + } + } else { + LOG(INFO) << "skip no tensor " << x; + } + } else { + LOG(INFO) << "skip tensor " << x; + } + } + return true; +} + +// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing +// on the complex RNN1 model. +TEST(Analyzer_rnn1, ZeroCopy) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + + PaddlePlace place; + int output_size{0}; + + auto predictor = + CreatePaddlePredictor( + config); + + config.use_feed_fetch_ops = true; + auto native_predictor = + CreatePaddlePredictor(config); + + config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + auto analysis_predictor = + CreatePaddlePredictor( + config); + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), + data_tensor.get(), hidden_init_tensor.get(), + week_tensor.get(), minute_tensor.get(), &data, + FLAGS_batch_size); + + // Prepare data for NativePredictor + std::vector> native_inputs; + SetInput(&native_inputs); + std::vector native_outputs; + std::vector analysis_outputs; + + auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); + // Run analysis predictor + + int num_ops; + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); + ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + ASSERT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + + Timer timer; + double total_time{0}; + double native_total_time{0}; + double analysis_total_time{0.}; + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + auto *output_data = output_tensor->data(&place, &output_size); + ASSERT_GT(output_size, 0); // more than one output! + + for (int i = 0; i < FLAGS_repeat; i++) { + // Run native predictor. + timer.tic(); + ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); + native_total_time += timer.toc(); + } + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + ASSERT_TRUE( + analysis_predictor->Run(native_inputs.front(), &analysis_outputs)); + analysis_total_time += timer.toc(); + } + + if (!FLAGS_with_precision_check) { + return; + } + int native_output_size = VecReduceToInt(native_outputs.front().shape); + + EXPECT_EQ(native_output_size, output_size); + + // Compare tensors between analysis and zerocopy + auto *p0 = static_cast(predictor.get()); + auto *p1 = static_cast(analysis_predictor.get()); + auto *p2 = static_cast(native_predictor.get()); + + std::vector tensor_names; + for (auto &var_desc : p0->program().Block(0).AllVars()) { + tensor_names.push_back(var_desc->Name()); + } + + LOG(INFO) << "Comparing tensors"; + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"})); + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"})); + + LOG(INFO) << "output1 " << inference::LoDTensorSummary( + p0->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output2 " << inference::LoDTensorSummary( + p1->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output3 " << inference::LoDTensorSummary( + p2->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + + for (int i = 0; i < output_size; i++) { + LOG(INFO) << output_data[i] << " " + << static_cast(native_outputs.front().data.data())[i] + << " " + << static_cast(analysis_outputs.front().data.data())[i]; + EXPECT_NEAR(output_data[i], + static_cast(native_outputs.front().data.data())[i], + 1e-3); + } + + LOG(INFO) << "batch_size: " << FLAGS_batch_size; + + LOG(INFO) << "zero average time: " + << total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "analysis average time: " + << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "native average time: " + << native_total_time / (FLAGS_repeat * FLAGS_batch_size); +} + +TEST(Analyzer_rnn1, ZeroCopyMultiThread) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + + auto base_predictor = CreatePaddlePredictor(config); + double total_time_of_threads{0}; + std::vector threads; + std::vector> predictors; + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + predictors.emplace_back(CreatePaddlePredictor(config)); + } + + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + threads.emplace_back([config, &total_time_of_threads, &predictors, tid] { + // auto predictor = base_predictor->Clone(); + auto &predictor = predictors[tid]; + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + Timer timer; + double total_time{0}; + + for (int i = 0; i < FLAGS_repeat; i++) { + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), + cell_init_tensor.get(), data_tensor.get(), + hidden_init_tensor.get(), week_tensor.get(), + minute_tensor.get(), &data, FLAGS_batch_size); + + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + total_time_of_threads += total_time; + + LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; + }); + } + + for (auto &t : threads) { + t.join(); + } + + LOG(INFO) << "average time: " + << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb4671c4379b5f6f144bfd5330866aa38163f4d4 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> title1_all, title2_all, title3_all, l1_all; + std::vector> title1, title2, title3, l1; + std::vector title1_lod, title2_lod, title3_lod, l1_lod; + size_t batch_iter{0}; + size_t batch_size{1}; + size_t num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= title1_all.size()) { + data.title1_all.assign(title1_all.begin() + batch_iter, + title1_all.begin() + batch_end); + data.title2_all.assign(title2_all.begin() + batch_iter, + title2_all.begin() + batch_end); + data.title3_all.assign(title3_all.begin() + batch_iter, + title3_all.begin() + batch_end); + data.l1_all.assign(l1_all.begin() + batch_iter, + l1_all.begin() + batch_end); + // Prepare LoDs + data.title1_lod.push_back(0); + data.title2_lod.push_back(0); + data.title3_lod.push_back(0); + data.l1_lod.push_back(0); + CHECK(!data.title1_all.empty()); + CHECK(!data.title2_all.empty()); + CHECK(!data.title3_all.empty()); + CHECK(!data.l1_all.empty()); + CHECK_EQ(data.title1_all.size(), data.title2_all.size()); + CHECK_EQ(data.title1_all.size(), data.title3_all.size()); + CHECK_EQ(data.title1_all.size(), data.l1_all.size()); + for (size_t j = 0; j < data.title1_all.size(); j++) { + data.title1.push_back(data.title1_all[j]); + data.title2.push_back(data.title2_all[j]); + data.title3.push_back(data.title3_all[j]); + data.l1.push_back(data.l1_all[j]); + // calculate lod + data.title1_lod.push_back(data.title1_lod.back() + + data.title1_all[j].size()); + data.title2_lod.push_back(data.title2_lod.back() + + data.title2_all[j].size()); + data.title3_lod.push_back(data.title3_lod.back() + + data.title3_all[j].size()); + data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + // load title1 data + std::vector title1_data; + split_to_int64(data[0], ' ', &title1_data); + // load title2 data + std::vector title2_data; + split_to_int64(data[1], ' ', &title2_data); + // load title3 data + std::vector title3_data; + split_to_int64(data[2], ' ', &title3_data); + // load l1 data + std::vector l1_data; + split_to_int64(data[3], ' ', &l1_data); + title1_all.push_back(std::move(title1_data)); + title2_all.push_back(std::move(title2_data)); + title3_all.push_back(std::move(title3_data)); + l1_all.push_back(std::move(l1_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor title1_tensor, title2_tensor, title3_tensor, l1_tensor; + title1_tensor.name = "title1"; + title2_tensor.name = "title2"; + title3_tensor.name = "title3"; + l1_tensor.name = "l1"; + auto one_batch = data->NextBatch(); + int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; + title1_tensor.shape.assign({title1_size, 1}); + title1_tensor.lod.assign({one_batch.title1_lod}); + int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; + title2_tensor.shape.assign({title2_size, 1}); + title2_tensor.lod.assign({one_batch.title2_lod}); + int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; + title3_tensor.shape.assign({title3_size, 1}); + title3_tensor.lod.assign({one_batch.title3_lod}); + int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; + l1_tensor.shape.assign({l1_size, 1}); + l1_tensor.lod.assign({one_batch.l1_lod}); + + // assign data + TensorAssignData(&title1_tensor, one_batch.title1); + TensorAssignData(&title2_tensor, one_batch.title2); + TensorAssignData(&title3_tensor, one_batch.title3); + TensorAssignData(&l1_tensor, one_batch.l1); + // Set inputs. + input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->model_dir = FLAGS_infer_model; + cfg->use_gpu = false; + cfg->device = 0; + cfg->specify_input_name = true; + cfg->enable_ir_optim = true; +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_seq_conv1, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + // the first inference result + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); + size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(outputs[0].data.data()); + // output is probability, which is in (0, 1). + for (size_t i = 0; i < size; i++) { + EXPECT_GT(result[i], 0); + EXPECT_LT(result[i], 1); + } + } +} + +// Check the fuse status +TEST(Analyzer_seq_conv1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_seq_conv1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis(cfg, input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 340ef152f0b1a15a451f840b36ae845ef4984740..ca19475bda372398d425b0fa6f9a732cd79a8166 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -104,5 +104,18 @@ TEST(Analyzer_Text_Classification, compare) { CompareNativeAndAnalysis(cfg, input_slots_all); } +TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { + AnalysisConfig cfg; + SetConfig(&cfg); + // Enable embedding_fc_lstm_fuse_pass (disabled by default) + auto it = std::find(cfg.ir_passes.begin(), cfg.ir_passes.end(), + "embedding_fc_lstm_fuse_pass"); + if (it != cfg.ir_passes.end()) cfg.ir_passes.erase(it); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis(cfg, input_slots_all); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 483ae66c5b24f6147b1b07da86494a914f80c34c..305b8bfe158150d5dfd8bdaee2c0a89afe264de4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -19,6 +19,7 @@ limitations under the License. */ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; struct Record { std::vector data; @@ -60,8 +61,6 @@ void SetConfig(AnalysisConfig *cfg) { cfg->ir_passes.push_back("fc_gru_fuse_pass"); #ifdef PADDLE_WITH_MKLDNN cfg->_use_mkldnn = true; - // disable mkldnn fuse since it should have some bugs - cfg->ir_passes.push_back("conv_relu_mkldnn_fuse_pass"); #endif } @@ -114,7 +113,8 @@ TEST(Analyzer_vis, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7189df775227680726a9d4840386280c5ad44c23..8603d09cbd09e4cee254faf52c2ccbe8d661994d 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include // NOLINT #include @@ -47,11 +48,8 @@ void CompareResult(const std::vector &outputs, for (size_t i = 0; i < outputs.size(); i++) { auto &out = outputs[i]; auto &ref_out = ref_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); + size_t size = VecReduceToInt(out.shape); + size_t ref_size = VecReduceToInt(ref_out.shape); EXPECT_GT(size, 0); EXPECT_EQ(size, ref_size); EXPECT_EQ(out.dtype, ref_out.dtype); @@ -87,16 +85,11 @@ std::unique_ptr CreateTestPredictor( } } -size_t GetSize(const PaddleTensor &out) { - return std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); -} +size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } -std::unordered_map GetFuseStatis(AnalysisConfig config, +std::unordered_map GetFuseStatis(PaddlePredictor *predictor, int *num_ops) { - auto predictor = CreateTestPredictor(config); - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); + auto *analysis_predictor = static_cast(predictor); auto &fuse_statis = analysis_predictor->analysis_argument() .Get>( framework::ir::kFuseStatisAttr); @@ -190,5 +183,127 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +template +std::string LoDTensorSummary(const framework::LoDTensor &tensor) { + std::stringstream ss; + ss << "\n---- tensor ---" << '\n'; + ss << "lod: ["; + for (const auto &level : tensor.lod()) { + ss << "[ "; + for (auto i : level) { + ss << i << ", "; + } + ss << "]"; + } + ss << "]\n"; + + ss << "shape: ["; + int size = 1; + for (int i = 0; i < tensor.dims().size(); i++) { + int dim = tensor.dims()[i]; + ss << dim << ", "; + size *= dim; + } + ss << "]\n"; + + ss << "data: "; + for (int i = 0; i < std::min(20, size); i++) { + ss << tensor.data()[i] << " "; + } + ss << "\n"; + + return ss.str(); +} + +static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + auto &al = a[i]; + auto &bl = b[i]; + if (al.size() != bl.size()) { + LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(), + bl.size()); + return false; + } + } + return true; +} + +static bool CompareShape(const std::vector &a, + const std::vector &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (a[i] != b[i]) { + LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i, + a[i], b[i]); + return false; + } + } + return true; +} + +static bool CompareTensorData(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + auto a_shape = framework::vectorize(a.dims()); + auto b_shape = framework::vectorize(b.dims()); + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + [](int a, int b) { return a * b; }); + if (a_size != b_size) { + LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", + a_size, b_size); + } + + for (size_t i = 0; i < a_size; i++) { + if (a.type() == typeid(float)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } else if (a.type() == typeid(int64_t)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } + } + + return true; +} + +static bool CompareTensor(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + if (!CompareLoD(a.lod(), b.lod())) { + return false; + } + if (!CompareShape(framework::vectorize(a.dims()), + framework::vectorize(b.dims()))) { + return false; + } + + if (!CompareTensorData(a, b)) { + return false; + } + + return true; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf320a0cbc2fff5f973c48768281e26d0fde232b --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { +using paddle::contrib::MixedRTConfig; + +DEFINE_string(dirname, "", "Directory of the inference model."); + +NativeConfig GetConfigNative() { + NativeConfig config; + config.model_dir = FLAGS_dirname; + // LOG(INFO) << "dirname " << config.model_dir; + config.fraction_of_gpu_memory = 0.45; + config.use_gpu = true; + config.device = 0; + return config; +} + +MixedRTConfig GetConfigTRT() { + MixedRTConfig config; + config.model_dir = FLAGS_dirname; + config.use_gpu = true; + config.fraction_of_gpu_memory = 0.2; + config.device = 0; + config.max_batch_size = 3; + return config; +} + +void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { + NativeConfig config0 = GetConfigNative(); + config0.model_dir = model_dirname; + + MixedRTConfig config1 = GetConfigTRT(); + config1.model_dir = model_dirname; + config1.max_batch_size = batch_size; + + auto predictor0 = + CreatePaddlePredictor(config0); + auto predictor1 = + CreatePaddlePredictor(config1); + // Prepare inputs + int height = 224; + int width = 224; + float *data = new float[batch_size * 3 * height * width]; + memset(data, 0, sizeof(float) * (batch_size * 3 * height * width)); + data[0] = 1.0f; + + // Prepare inputs + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({batch_size, 3, height, width}); + tensor.data = PaddleBuf(static_cast(data), + sizeof(float) * (batch_size * 3 * height * width)); + tensor.dtype = PaddleDType::FLOAT32; + std::vector paddle_tensor_feeds(1, tensor); + + // Prepare outputs + std::vector outputs0; + std::vector outputs1; + CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0)); + + CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size)); + + // Get output. + ASSERT_EQ(outputs0.size(), 1UL); + ASSERT_EQ(outputs1.size(), 1UL); + + const size_t num_elements = outputs0.front().data.length() / sizeof(float); + const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); + EXPECT_EQ(num_elements, num_elements1); + + auto *data0 = static_cast(outputs0.front().data.data()); + auto *data1 = static_cast(outputs1.front().data.data()); + + ASSERT_GT(num_elements, 0UL); + for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { + EXPECT_NEAR(data0[i], data1[i], 1e-3); + } +} + +TEST(trt_models_test, main) { + std::vector infer_models = {"mobilenet", "resnet50", + "resnext50"}; + for (auto &model_dir : infer_models) { + CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir); + } +} +} // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 017fc4cd7b11c150cb941fffca2606a4d707330f..977155440df5294216382cff1c67c2aaca1f546d 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -4,7 +4,6 @@ function(inference_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) set(arg_list "") if(inference_test_ARGS) foreach(arg ${inference_test_ARGS}) diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 283745e977533358ef52521b36e67f0ada950e61..0f13a4ea9c1af175771f5cc201ea5c0a8a0f7555 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -36,6 +36,8 @@ namespace memory { using BuddyAllocator = detail::BuddyAllocator; BuddyAllocator* GetCPUBuddyAllocator() { + // We tried thread_local for inference::RNN1 model, but that not works much + // for multi-thread test. static std::once_flag init_flag; static detail::BuddyAllocator* a = nullptr; @@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() { return a; } +// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation, +// seems they are almost the same overhead. +struct NaiveAllocator { + void* Alloc(size_t size) { return malloc(size); } + + void Free(void* p) { + PADDLE_ENFORCE(p); + free(p); + } + + static NaiveAllocator* Instance() { + static NaiveAllocator x; + return &x; + } + + private: + std::mutex lock_; +}; + template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9c67df7bdfb2c4e5d1c9fe60676c412ab11b4fa5..2ef13b72ed3ff6ae8ad8748ddea977e693615ac6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -82,10 +82,11 @@ function(op_library TARGET) if (${cc_srcs_len} EQUAL 0) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() - - #remove windows unsupported op if (WIN32) - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + # remove windows unsupported op, because windows has no nccl, no warpctc such ops. + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" + "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" + "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -281,10 +282,12 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) +if (NOT WIN32) op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(gru_op DEPS sequence2batch gru_compute) +endif(NOT WIN32) op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) @@ -297,10 +300,10 @@ op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) op_library(fusion_lstm_op DEPS cpu_lstm_compute) - if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) + op_library(reduce_mean_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() @@ -313,11 +316,6 @@ op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) op_library(concat_op DEPS concat) -# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency -add_subdirectory(concurrency) -op_library(channel_send_op DEPS concurrency) -op_library(channel_recv_op DEPS concurrency) - list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 286b03d7b7d11a50f33f0190c1a5b9097ed0f4a2..c091476d6d132db17a656d5c8dee65e3a88d9ac2 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include #include "paddle/fluid/operators/mkldnn_activation_op.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel { } }; -__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC( +UNUSED constexpr char SigmoidDoc[] = R"DOC( Sigmoid Activation Operator $$out = \frac{1}{1 + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC( +UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator $$out = \\log \\frac{1}{1 + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char ExpDoc[] = R"DOC( +UNUSED constexpr char ExpDoc[] = R"DOC( Exp Activation Operator. $out = e^x$ )DOC"; -__attribute__((unused)) constexpr char ReluDoc[] = R"DOC( +UNUSED constexpr char ReluDoc[] = R"DOC( Relu Activation Operator. $out = \max(x, 0)$ )DOC"; -__attribute__((unused)) constexpr char TanhDoc[] = R"DOC( +UNUSED constexpr char TanhDoc[] = R"DOC( Tanh Activation Operator. $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( +UNUSED constexpr char TanhShrinkDoc[] = R"DOC( TanhShrink Activation Operator. $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC( +UNUSED constexpr char SqrtDoc[] = R"DOC( Sqrt Activation Operator. $out = \sqrt{x}$ )DOC"; -__attribute__((unused)) constexpr char AbsDoc[] = R"DOC( +UNUSED constexpr char AbsDoc[] = R"DOC( Abs Activation Operator. $out = |x|$ )DOC"; -__attribute__((unused)) constexpr char CeilDoc[] = R"DOC( +UNUSED constexpr char CeilDoc[] = R"DOC( Ceil Activation Operator. $out = ceil(x)$ )DOC"; -__attribute__((unused)) constexpr char FloorDoc[] = R"DOC( +UNUSED constexpr char FloorDoc[] = R"DOC( Floor Activation Operator. $out = floor(x)$ )DOC"; -__attribute__((unused)) constexpr char CosDoc[] = R"DOC( +UNUSED constexpr char CosDoc[] = R"DOC( Cosine Activation Operator. $out = cos(x)$ )DOC"; -__attribute__((unused)) constexpr char SinDoc[] = R"DOC( +UNUSED constexpr char SinDoc[] = R"DOC( Sine Activation Operator. $out = sin(x)$ )DOC"; -__attribute__((unused)) constexpr char RoundDoc[] = R"DOC( +UNUSED constexpr char RoundDoc[] = R"DOC( Round Activation Operator. $out = [x]$ )DOC"; -__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC( +UNUSED constexpr char ReciprocalDoc[] = R"DOC( Reciprocal Activation Operator. $$out = \\frac{1}{x}$$ )DOC"; -__attribute__((unused)) constexpr char LogDoc[] = R"DOC( +UNUSED constexpr char LogDoc[] = R"DOC( Log Activation Operator. $out = \ln(x)$ @@ -212,21 +213,21 @@ Natural logarithm of x. )DOC"; -__attribute__((unused)) constexpr char SquareDoc[] = R"DOC( +UNUSED constexpr char SquareDoc[] = R"DOC( Square Activation Operator. $out = x^2$ )DOC"; -__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC( +UNUSED constexpr char SoftplusDoc[] = R"DOC( Softplus Activation Operator. $out = \ln(1 + e^{x})$ )DOC"; -__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC( +UNUSED constexpr char SoftsignDoc[] = R"DOC( Softsign Activation Operator. $$out = \frac{x}{1 + |x|}$$ diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index dfaa7456f917c1308984b361afed752f96ea6f59..0784920064a879963cd9725cd9acf4cec7b874ce 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel { "Out and Label should have same height."); int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; + int slide_steps = ctx->Attrs().Get("slide_steps"); + + PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1"); + PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number"); ctx->SetOutputDim("AUC", {1}); - ctx->SetOutputDim("BatchAUC", {1}); - ctx->SetOutputDim("StatPosOut", {num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {num_pred_buckets}); + + slide_steps = slide_steps == 0 ? 1 : slide_steps; + ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets}); + ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets}); } protected: @@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "A 2D int tensor indicating the label of the training data. " "shape: [batch_size, 1]"); + // TODO(typhoonzero): support weight input AddInput("StatPos", "Statistic value when label = 1"); AddInput("StatNeg", "Statistic value when label = 0"); @@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("AUC", "A scalar representing the " "current area-under-the-curve."); - AddOutput("BatchAUC", "The AUC for current batch"); + AddOutput("StatPosOut", "Statistic value when label = 1"); AddOutput("StatNegOut", "Statistic value when label = 0"); AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") .SetDefault("ROC"); - AddAttr("num_thresholds", - "The number of thresholds to use when discretizing the" - " roc curve.") + AddAttr( + "num_thresholds", + "The number of thresholds to use when discretizing the roc curve.") .SetDefault((2 << 12) - 1); - + AddAttr("slide_steps", "Use slide steps to calc batch auc.") + .SetDefault(1); AddComment(R"DOC( Area Under The Curve (AUC) Operator. diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index fb0517d70635e090f8c5b59ff9d8420fc34c747b..fb370842d1942c3b3eebecb1fe5e8ffb845cb34b 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel { std::string curve = ctx.Attr("curve"); int num_thresholds = ctx.Attr("num_thresholds"); + // buckets contain numbers from 0 to num_thresholds int num_pred_buckets = num_thresholds + 1; + int slide_steps = ctx.Attr("slide_steps"); // Only use output var for now, make sure it's persistable and // not cleaned up for each batch. @@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel { auto *stat_pos = ctx.Output("StatPosOut"); auto *stat_neg = ctx.Output("StatNegOut"); - auto *stat_pos_data = stat_pos->mutable_data(ctx.GetPlace()); - auto *stat_neg_data = stat_neg->mutable_data(ctx.GetPlace()); - calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds, - auc); + auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); + auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *batch_auc = ctx.Output("BatchAUC"); - std::vector stat_pos_batch(num_pred_buckets, 0); - std::vector stat_neg_batch(num_pred_buckets, 0); - calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(), - num_thresholds, batch_auc); + std::vector stat_pos_data(num_pred_buckets, 0); + std::vector stat_neg_data(num_pred_buckets, 0); + + auto stat_pos_calc = stat_pos_data.data(); + auto stat_neg_calc = stat_neg_data.data(); + + statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps, + origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc); + + calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc); } private: @@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel { return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; } - inline static void calcAuc(const framework::ExecutionContext &ctx, - const framework::Tensor *label, + inline static void statAuc(const framework::Tensor *label, const framework::Tensor *predict, - int64_t *stat_pos, int64_t *stat_neg, - int num_thresholds, - framework::Tensor *auc_tensor) { + const int num_pred_buckets, + const int num_thresholds, const int slide_steps, + int64_t *origin_stat_pos, int64_t *origin_stat_neg, + int64_t **stat_pos, int64_t **stat_neg) { size_t batch_size = predict->dims()[0]; size_t inference_width = predict->dims()[1]; const T *inference_data = predict->data(); const auto *label_data = label->data(); - auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); - for (size_t i = 0; i < batch_size; i++) { uint32_t binIdx = static_cast( inference_data[i * inference_width + 1] * num_thresholds); if (label_data[i]) { - stat_pos[binIdx] += 1.0; + (*stat_pos)[binIdx] += 1.0; } else { - stat_neg[binIdx] += 1.0; + (*stat_neg)[binIdx] += 1.0; } } + int bucket_length = num_pred_buckets * sizeof(int64_t); + + // will stat auc unlimited. + if (slide_steps == 0) { + for (int slide = 0; slide < num_pred_buckets; ++slide) { + origin_stat_pos[slide] += (*stat_pos)[slide]; + origin_stat_neg[slide] += (*stat_neg)[slide]; + } + + *stat_pos = origin_stat_pos; + *stat_neg = origin_stat_neg; + + } else { + for (int slide = 1; slide < slide_steps; ++slide) { + int dst_idx = (slide - 1) * num_pred_buckets; + int src_inx = slide * num_pred_buckets; + std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx, + bucket_length); + std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx, + bucket_length); + } + + std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets, + *stat_pos, bucket_length); + std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets, + *stat_neg, bucket_length); + + std::memset(*stat_pos, 0, bucket_length); + std::memset(*stat_neg, 0, bucket_length); + + for (int slide = 0; slide < num_pred_buckets; ++slide) { + int stat_pos_steps = 0; + int stat_neg_steps = 0; + for (int step = 0; step < slide_steps; ++step) { + stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets]; + stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets]; + } + (*stat_pos)[slide] += stat_pos_steps; + (*stat_neg)[slide] += stat_neg_steps; + } + } + } + + inline static void calcAuc(const framework::ExecutionContext &ctx, + int64_t *stat_pos, int64_t *stat_neg, + int num_thresholds, + framework::Tensor *auc_tensor) { + auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); + *auc = 0.0f; double totPos = 0.0; @@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel { totPos += stat_pos[idx]; totNeg += stat_neg[idx]; *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; } diff --git a/paddle/fluid/operators/channel_close_op.cc b/paddle/fluid/operators/channel_close_op.cc deleted file mode 100644 index 8e2db250a069c488ee98f618bc03df6485022456..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_close_op.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace pf = paddle::framework; -static constexpr char kChannel[] = "Channel"; - -namespace paddle { -namespace operators { - -class ChannelCloseOp : public framework::OperatorBase { - public: - ChannelCloseOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto &inp = *scope.FindVar(Input(kChannel)); - - // Get the mutable version of the channel variable and closes it. - pf::ChannelHolder *ch = inp.GetMutable(); - ch->close(); - } -}; - -class ChannelCloseOpOpInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInput("Channel"), - "The input of ChannelClose op must be set"); - } -}; - -class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kChannel, - "The Channel Variable that should be closed by" - " the ChannelClose Op."); - AddComment(R"DOC( -Channel Close Operator. - -This operator closes an open channel. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelCloseOpMaker); diff --git a/paddle/fluid/operators/channel_create_op.cc b/paddle/fluid/operators/channel_create_op.cc deleted file mode 100644 index a7f59e4088e3fb328e5b5a83eed65f0f90edb9f0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_create_op.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/reader.h" - -namespace pf = paddle::framework; - -static constexpr char kOutput[] = "Out"; - -namespace paddle { -namespace operators { - -class ChannelCreateOp : public framework::OperatorBase { - public: - ChannelCreateOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto &out = *scope.FindVar(Output(kOutput)); - - // Determine the datatype and capacity of the channel to be created - // from the attributes provided. - auto dtype = - static_cast(Attr("data_type")); - auto capacity = Attr("capacity"); - - // Based on the datatype, create a new channel holder initialized with - // the given capacity. When capacity is 0, an unbuffered channel is - // created. - pf::ChannelHolder *ch = out.GetMutable(); - if (dtype == framework::proto::VarType::LOD_TENSOR) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::SELECTED_ROWS) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::READER) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::CHANNEL) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::BOOL) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::INT32) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::INT64) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::FP32) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::FP64) { - ch->Reset(capacity); - } else { - PADDLE_THROW( - "Data type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, " - "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]", - dtype); - } - } -}; - -class ChannelCreateOpOpInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasOutput(kOutput), - "The output of ChannelCreate op must be set"); - context->SetOutputDim(kOutput, {1}); - } -}; - -class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput(kOutput, - "The object of a Channel type created by ChannelCreate Op."); - AddAttr("capacity", "The size of the buffer of Channel.") - .SetDefault(0); - AddAttr("data_type", "The data type of elements inside the Channel."); - AddComment(R"DOC( -Channel Create Operator. - -This operator creates an object of the VarType Channel and returns it. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelCreateOpMaker); diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc deleted file mode 100644 index 101015e837e28b504b71d919abd5f908a102c812..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_recv_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/operators/math/math_function.h" - -static constexpr char Channel[] = "Channel"; -static constexpr char Status[] = "Status"; -static constexpr char Out[] = "Out"; - -namespace paddle { -namespace operators { - -void SetReceiveStatus(const platform::Place &dev_place, - framework::Variable *status_var, bool status) { - auto cpu = platform::CPUPlace(); - auto status_tensor = - status_var->GetMutable()->mutable_data({1}, - cpu); - status_tensor[0] = status; -} - -class ChannelRecvOp : public framework::OperatorBase { - public: - ChannelRecvOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const { - PADDLE_ENFORCE(ctx->HasInput(Channel), - "Input(Channel) of ChannelRecvOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput(Out), - "Input(Channel) of ChannelRecvOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput(Status), - "Output(Status) of ChannelRecvOp should not be null."); - ctx->SetOutputDim("Status", {1}); - } - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - // Get the channel holder created by channel_create op, passed as input. - framework::ChannelHolder *ch = - scope.FindVar(Input(Channel))->GetMutable(); - auto output_var = scope.FindVar(Output(Out)); - // Receive the data from the channel. - bool ok = concurrency::ChannelReceive(ch, output_var); - - // Set the status output of the `ChannelReceive` call. - SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok); - } -}; - -class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(Channel, - "(Channel) A variable which \"receives\" the a value sent" - "to it by a channel_send op.") - .AsDuplicable(); - AddOutput(Out, - "(Variable) Output Variable that will hold the data received" - " from the Channel") - .AsDuplicable(); - AddOutput(Status, - "(Tensor) An LoD Tensor that returns a boolean status of the" - "result of the receive operation.") - .AsDuplicable(); - AddComment(R"DOC( -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelRecvOpMaker); diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc deleted file mode 100644 index 67d6deb511d883ac69426ddd34be2199367cd4c7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_send_op.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/operators/math/math_function.h" - -static constexpr char Channel[] = "Channel"; -static constexpr char X[] = "X"; - -namespace paddle { -namespace operators { - -class ChannelSendOp : public framework::OperatorBase { - public: - ChannelSendOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const { - PADDLE_ENFORCE(ctx->HasInput(Channel), - "Input(Channel) of ChannelSendOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput(X), - "Input(X) of ChannelSendOp should not be null."); - } - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - // Get the channel holder created by channel_create op, passed as input. - framework::ChannelHolder *ch = - scope.FindVar(Input(Channel))->GetMutable(); - auto input_var = scope.FindVar(Input(X)); - - // Send the input data through the channel. - concurrency::ChannelSend(ch, input_var); - } -}; - -class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(Channel, - "(Channel) A variable which \"sends\" the passed in value to " - "a listening receiver.") - .AsDuplicable(); - AddInput(X, "(Variable) The value which gets sent by the channel.") - .AsDuplicable(); - AddComment(R"DOC( -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelSendOpMaker); diff --git a/paddle/fluid/operators/concurrency/CMakeLists.txt b/paddle/fluid/operators/concurrency/CMakeLists.txt deleted file mode 100644 index e4617440d152b4c15d09e81cd19c76739b95b979..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3) diff --git a/paddle/fluid/operators/concurrency/channel_util.cc b/paddle/fluid/operators/concurrency/channel_util.cc deleted file mode 100644 index fba4abf1897bceea615222b2438700085ed8e551..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/channel_util.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/framework/var_type.h" - -namespace poc = paddle::operators::concurrency; - -void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_READER) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_CHANNEL) - ch->Send(var->GetMutable()); - else - PADDLE_THROW("ChannelSend:Unsupported type"); -} - -bool poc::ChannelReceive(framework::ChannelHolder *ch, - framework::Variable *var) { - // Get type of channel and use that to call mutable data for Variable - auto type = framework::ToVarType(ch->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_READER) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_CHANNEL) - return ch->Receive(var->GetMutable()); - else - PADDLE_THROW("ChannelReceive:Unsupported type"); -} - -void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, cb); - } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_READER) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_CHANNEL) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else { - PADDLE_THROW("ChannelAddToSendQ:Unsupported type"); - } -} - -void poc::ChannelAddToReceiveQ( - framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, std::shared_ptr cond, - std::function cb) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) { - ch->AddToReceiveQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_READER) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_CHANNEL) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else { - PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type"); - } -} diff --git a/paddle/fluid/operators/concurrency/channel_util.h b/paddle/fluid/operators/concurrency/channel_util.h deleted file mode 100644 index cd18ca78c6fdecdc6c72748611ccdd9c2690ef46..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/channel_util.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/variable.h" - -namespace paddle { -namespace operators { -namespace concurrency { - -void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); -bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); - -void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb); -void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb); - -} // namespace concurrency -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index b3140116dfe6a17a400bb88219ff43b249ecb32a..ef76106f17218a03d24ebc0eca43dbb0ae935093 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel { math::DepthwiseConvFunctor depthwiseConv; auto& dev_ctx = context.template device_context(); - depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); } }; @@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel { input_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, input_grad, static_cast(0)); depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, input_grad); + paddings, dilations, input_grad); } if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, - filter_grad); + dilations, filter_grad); } } }; diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 0d9c6a62fec1ea24bee5c24b4a7b792781f14d9e..88c578b1410558b9adcd55f1cd6b53fb9cb124e2 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, - output); + dilations, output); } }; @@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); if (input_grad) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations, input_grad); } @@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, - filter_grad); + dilations, filter_grad); } } }; diff --git a/paddle/fluid/operators/cub_reduce.h b/paddle/fluid/operators/cub_reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..16fdad775f7befaac04b1ac59a601f04e0ab2bdc --- /dev/null +++ b/paddle/fluid/operators/cub_reduce.h @@ -0,0 +1,322 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include // NOLINT +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +namespace detail { +template +struct Array { + public: + HOSTDEVICE inline Array() {} + + HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; } + + HOSTDEVICE inline const T& operator[](size_t index) const { + return data_[index]; + } + + HOSTDEVICE constexpr inline size_t size() const { return ElementCount; } + + template + static inline Array From(const VectorLikeType& vec) { + PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match"); + size_t n = static_cast(vec.size()); + Array ret; + for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; + return ret; + } + + private: + T data_[ElementCount]; +}; + +// reduce the last axis of 2d array +template +__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, + int reduce_num) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + int idx_x = blockIdx.x * reduce_num; + int idx_y = threadIdx.x; + Ty reduce_var = init; + for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) + reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y])); + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = reduce_var; + } +} + +template +__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, int reduce_num, + Array x_strides, + Array reduce_dim, + Array reduce_strides, + Array left_dim, + Array left_strides) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + Array sub_index; + int left_idx = blockIdx.x; + for (int i = 0; i < Rank - ReduceRank; ++i) { + sub_index[left_dim[i]] = left_idx / left_strides[i]; + left_idx %= left_strides[i]; + } + + int reduce_idx = threadIdx.x; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + Ty reduce_var = static_cast(transformer(x[idx_x])); + + for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { + int reduce_idx = i; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + reduce_var = static_cast(reducer(reduce_var, transformer(x[idx_x]))); + } + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = reduce_var; + } +} + +static inline std::vector GetStrides(const std::vector& dims) { + int n = static_cast(dims.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[i + 1]; + } + return strides; +} + +static inline std::vector GetStrides(const std::vector& dims, + const std::vector& idx) { + int n = static_cast(idx.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[idx[i + 1]]; + } + return strides; +} + +constexpr int kMaxBlockDim = 512; + +static inline int GetDesiredBlockDim(int block_dim) { + return block_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << static_cast(std::log2(block_dim))); +} + +template +static void TensorReduceImpl( + const Tx* x_data, Ty* y_data, const platform::Place& place, + const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, + int left_num, int reduce_num, const std::vector& x_strides, + const std::vector& reduce_dim, const std::vector& reduce_strides, + const std::vector& left_dim, const std::vector& left_strides, + cudaStream_t stream) { +#define CUB_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kRank = i; \ + switch (reduce_rank) { __VA_ARGS__; } \ + } break + +#define CUB_REDUCE_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kReduceRank = i; \ + ReduceKernel<<>>( \ + x_data, y_data, reducer, transformer, init, reduce_num, \ + Array::From(x_strides), \ + Array::From(reduce_dim), \ + Array::From(reduce_strides), \ + Array::From(left_dim), \ + Array::From(left_strides)); \ + } break + + int rank = x_strides.size(); + int reduce_rank = reduce_strides.size(); + if (rank == reduce_rank) { + cub::TransformInputIterator trans_x( + x_data, transformer); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + reduce_num, reducer, init, stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), + place); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + reduce_num, reducer, init, stream); + return; + } + if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + ReduceKernel2D<<>>( + x_data, y_data, reducer, transformer, init, reduce_num); + return; + } + /* + if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) { + // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced. + // Currently, it is handled by code below, but inefficient + return; + } + */ + + switch (rank) { + CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); + + CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);); + + CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3);); + + CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);); + + CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5);); + + CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); + + CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); + + CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6); + CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8);); + } + +#undef CUB_REDUCE_RANK_CASE +#undef CUB_RANK_CASE +} + +} // namespace detail + +template +void TensorReduce(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, const Ty& init, + const ReduceOp& reducer, const TransformOp& transformer, + cudaStream_t stream) { + auto x_dim = framework::vectorize2int(x.dims()); + std::vector new_x_dim, new_reduce_dims; + int is_reduced = 0; + for (auto e : origin_reduce_dims) { + auto pos = e >= 0 ? e : e + x_dim.size(); + is_reduced |= 1 << e; + } + for (int i = 0; i < x_dim.size(); i++) { + if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { + new_x_dim.push_back(x_dim[i]); + if ((is_reduced >> i) & 1) + new_reduce_dims.push_back(new_x_dim.size() - 1); + } else { + new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; + } + } + x_dim = new_x_dim; + origin_reduce_dims = new_reduce_dims; + int x_rank = static_cast(x_dim.size()); + std::set left_set, reduce_set; + for (int i = 0; i < x_rank; ++i) left_set.insert(i); + + for (auto e : origin_reduce_dims) { + left_set.erase(e); + reduce_set.insert(e); + } + + std::vector reduce_dim(reduce_set.begin(), reduce_set.end()); + std::vector left_dim(left_set.begin(), left_set.end()); + + std::vector x_strides = detail::GetStrides(x_dim); + std::vector reduce_strides = detail::GetStrides(x_dim, reduce_dim); + std::vector left_strides = detail::GetStrides(x_dim, left_dim); + int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; + int left_num = 1; + if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]]; + + std::vector y_dim(left_dim.size()); + for (int i = 0; i < left_dim.size(); ++i) { + y_dim[i] = x_dim[left_dim[i]]; + } + auto x_data = x.data(); + auto y_data = y->mutable_data(x.place()); + if (reduce_num == 1) return; + +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + detail::TensorReduceImpl( \ + x_data, y_data, x.place(), reducer, transformer, init, left_num, \ + reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \ + left_strides, stream); \ + } break + + switch (detail::GetDesiredBlockDim(reduce_num)) { + CUB_BLOCK_DIM_CASE(512); + CUB_BLOCK_DIM_CASE(256); + CUB_BLOCK_DIM_CASE(128); + CUB_BLOCK_DIM_CASE(64); + CUB_BLOCK_DIM_CASE(32); + CUB_BLOCK_DIM_CASE(16); + CUB_BLOCK_DIM_CASE(8); + CUB_BLOCK_DIM_CASE(4); + CUB_BLOCK_DIM_CASE(2); + } +#undef CUB_BLOCK_DIM_CASE +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 5a058ddbc59c6135bacf7c2dc4b5c8b687f9b2b1..aa8ed502fc94bd0970dfe5dbf00ef090e799ad30 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) -detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + +if(WITH_GPU) + detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) +else() + detection_library(generate_proposals_op SRCS generate_proposals_op.cc) +endif() + detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) #Export local libraries to parent set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index c33aa255362bc5234f2813fb93e70c943b03c33f..818d58ea9ee327fd99182ad2f8cbeed07e6aaea2 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" @@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Anchors")->type()), - platform::CPUPlace()); + ctx.device_context()); } }; @@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, const T *im_info_data = im_info.data(); T *boxes_data = boxes->mutable_data(ctx.GetPlace()); T im_scale = im_info_data[2]; - keep->Resize({boxes->dims()[0], 1}); + keep->Resize({boxes->dims()[0]}); min_size = std::max(min_size, 1.0f); int *keep_data = keep->mutable_data(ctx.GetPlace()); @@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("post_nms_topN", "post_nms_topN"); AddAttr("nms_thresh", "nms_thres"); AddAttr("min_size", "min size"); - AddAttr("eta", "eta"); + AddAttr("eta", "The parameter for adaptive NMS."); AddComment(R"DOC( Generate Proposals OP diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6146ff509d768c0317a5c65ed22af1a3075977a2 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -0,0 +1,449 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +namespace { + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +int const kThreadsPerBlock = sizeof(uint64_t) * 8; + +template +__global__ void RangeInitKernel(const T start, const T delta, const int size, + T *out) { + CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; } +} + +template +void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value, + Tensor *value_out, Tensor *index_out) { + int num = value.numel(); + Tensor index_in_t; + int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); + int block = 512; + auto stream = ctx.stream(); + RangeInitKernel<<>>(0, 1, num, idx_in); + int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); + + const T *keys_in = value.data(); + T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, + num); + + // Allocate temporary storage + auto place = boost::get(ctx.GetPlace()); + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, + num); + + memory::Free(place, d_temp_storage); +} + +template +__device__ __forceinline__ T Min(T x, T y) { + return x < y ? x : y; +} + +template +__device__ __forceinline__ T Max(T x, T y) { + return x > y ? x : y; +} + +template +__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, + const T *var, const int *index, + const T *im_info, const int num, + T *proposals) { + T kBBoxClipDefault = log(1000.0 / 16.0); + CUDA_1D_KERNEL_LOOP(i, num) { + int k = index[i] * 4; + T axmin = anchor[k]; + T aymin = anchor[k + 1]; + T axmax = anchor[k + 2]; + T aymax = anchor[k + 3]; + + T w = axmax - axmin + 1.0; + T h = aymax - aymin + 1.0; + T cx = axmin + 0.5 * w; + T cy = aymin + 0.5 * h; + + T dxmin = deltas[k]; + T dymin = deltas[k + 1]; + T dxmax = deltas[k + 2]; + T dymax = deltas[k + 3]; + + T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.; + if (var) { + d_cx = cx + dxmin * w * var[k]; + d_cy = cy + dymin * h * var[k + 1]; + d_w = exp(Min(dxmax * var[k + 2], kBBoxClipDefault)) * w; + d_h = exp(Min(dymax * var[k + 3], kBBoxClipDefault)) * h; + } else { + d_cx = cx + dxmin * w; + d_cy = cy + dymin * h; + d_w = exp(Min(dxmax, kBBoxClipDefault)) * w; + d_h = exp(Min(dymax, kBBoxClipDefault)) * h; + } + + T oxmin = d_cx - d_w * 0.5; + T oymin = d_cy - d_h * 0.5; + T oxmax = d_cx + d_w * 0.5 - 1.; + T oymax = d_cy + d_h * 0.5 - 1.; + + proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); + } +} + +template +__global__ void FilterBBoxes(const T *bboxes, const T *im_info, + const T min_size, const int num, int *keep_num, + int *keep) { + T im_h = im_info[0]; + T im_w = im_info[1]; + T im_scale = im_info[2]; + + int cnt = 0; + __shared__ int keep_index[BlockSize]; + + CUDA_1D_KERNEL_LOOP(i, num) { + keep_index[threadIdx.x] = -1; + __syncthreads(); + + int k = i * 4; + T xmin = bboxes[k]; + T ymin = bboxes[k + 1]; + T xmax = bboxes[k + 2]; + T ymax = bboxes[k + 3]; + + T w = xmax - xmin + 1.0; + T h = ymax - ymin + 1.0; + T cx = xmin + w / 2.; + T cy = ymin + h / 2.; + + T w_s = (xmax - xmin) / im_scale + 1.; + T h_s = (ymax - ymin) / im_scale + 1.; + + if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = i; + } + __syncthreads(); + if (threadIdx.x == 0) { + int size = (num - i) < BlockSize ? num - i : BlockSize; + for (int j = 0; j < size; ++j) { + if (keep_index[j] > -1) { + keep[cnt++] = keep_index[j]; + } + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + keep_num[0] = cnt; + } +} + +__device__ inline float IoU(const float *a, const float *b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float inter_s = width * height; + float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return inter_s / (s_a + s_b - inter_s); +} + +__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + const int row_size = + min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock); + const int col_size = + min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock); + + __shared__ float block_boxes[kThreadsPerBlock * 4]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 4 + 0] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0]; + block_boxes[threadIdx.x * 4 + 1] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1]; + block_boxes[threadIdx.x * 4 + 2] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2]; + block_boxes[threadIdx.x * 4 + 3] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +template +void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, + const Tensor &sorted_indices, const T nms_threshold, + Tensor *keep_out) { + int boxes_num = proposals.dims()[0]; + PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]); + + const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); + dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), + DIVUP(boxes_num, kThreadsPerBlock)); + dim3 threads(kThreadsPerBlock); + + const T *boxes = proposals.data(); + auto place = boost::get(ctx.GetPlace()); + int size_bytes = boxes_num * col_blocks * sizeof(uint64_t); + uint64_t *d_mask = + reinterpret_cast(memory::Alloc(place, size_bytes)); + NMSKernel<<>>(boxes_num, nms_threshold, boxes, d_mask); + uint64_t *h_mask = reinterpret_cast( + memory::Alloc(platform::CPUPlace(), size_bytes)); + memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + std::vector keep_vec; + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / kThreadsPerBlock; + int inblock = i % kThreadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + ++num_to_keep; + keep_vec.push_back(i); + uint64_t *p = &h_mask[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); + memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), + sizeof(int) * num_to_keep, 0); + memory::Free(place, d_mask); + memory::Free(platform::CPUPlace(), h_mask); +} + +template +std::pair ProposalForOneImage( + const platform::CUDADeviceContext &ctx, const Tensor &im_info, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas, // [M, 4] + const Tensor &scores, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + // 1. pre nms + Tensor scores_sort, index_sort; + SortDescending(ctx, scores, &scores_sort, &index_sort); + int num = scores.numel(); + int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() + : pre_nms_top_n; + scores_sort.Resize({pre_nms_num, 1}); + index_sort.Resize({pre_nms_num, 1}); + + // 2. box decode and clipping + Tensor proposals; + proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); + int block = 512; + auto stream = ctx.stream(); + BoxDecodeAndClipKernel<<>>( + anchors.data(), bbox_deltas.data(), variances.data(), + index_sort.data(), im_info.data(), pre_nms_num, + proposals.data()); + + // 3. filter + Tensor keep_index, keep_num_t; + keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); + keep_num_t.mutable_data({1}, ctx.GetPlace()); + min_size = std::max(min_size, 1.0f); + FilterBBoxes<<<1, 512, 0, stream>>>( + proposals.data(), im_info.data(), min_size, pre_nms_num, + keep_num_t.data(), keep_index.data()); + int keep_num; + const auto gpu_place = boost::get(ctx.GetPlace()); + memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, + keep_num_t.data(), sizeof(int), 0); + keep_index.Resize({keep_num}); + + Tensor scores_filter, proposals_filter; + proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); + scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); + GPUGather(ctx, proposals, keep_index, &proposals_filter); + GPUGather(ctx, scores_sort, keep_index, &scores_filter); + + if (nms_thresh <= 0) { + return std::make_pair(proposals_filter, scores_filter); + } + + // 4. nms + Tensor keep_nms; + NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + Tensor scores_nms, proposals_nms; + proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + + return std::make_pair(proposals_nms, scores_nms); +} +} // namespace + +template +class CUDAGenerateProposalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto *anchors = context.Input("Anchors"); + auto *variances = context.Input("Variances"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS."); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + Tensor *anchor = const_cast(anchors); + anchor->Resize({anchors->numel() / 4, 4}); + Tensor *var = const_cast(variances); + var->Resize({var->numel() / 4, 4}); + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + T *rpn_rois_data = rpn_rois->data(); + T *rpn_roi_probs_data = rpn_roi_probs->data(); + + auto place = boost::get(dev_ctx.GetPlace()); + + int64_t num_proposals = 0; + std::vector offset(1, 0); + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair box_score_pair = + ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + + Tensor proposals = box_score_pair.first; + Tensor scores = box_score_pair.second; + + memory::Copy(place, rpn_rois_data + num_proposals * 4, place, + proposals.data(), sizeof(T) * proposals.numel(), 0); + memory::Copy(place, rpn_roi_probs_data + num_proposals, place, + scores.data(), sizeof(T) * scores.numel(), 0); + num_proposals += proposals.dims()[0]; + offset.emplace_back(num_proposals); + } + framework::LoD lod; + lod.emplace_back(offset); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(generate_proposals, + ops::CUDAGenerateProposalsKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 4cc980b41b34894f9d915d4b325887548091c0eb..42c720e701fbabacf1280dec2f78d3f6b99dfea2 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -104,7 +104,6 @@ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) * a33 = 1 - * */ template void get_transform_matrix(const int transformed_width, @@ -260,8 +259,8 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel { roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); auto lod = rois->lod().back(); - for (int i = 0; i < lod.size() - 1; ++i) { - for (int j = lod[i]; j < lod[i + 1]; ++j) { + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { roi2image_data[j] = i; } } @@ -393,8 +392,8 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); auto lod = rois->lod().back(); - for (int i = 0; i < lod.size() - 1; ++i) { - for (int j = lod[i]; j < lod[i + 1]; ++j) { + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { roi2image_data[j] = i; } } @@ -404,7 +403,7 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { for (int in_h = 0; in_h < in_height; ++in_h) { for (int in_w = 0; in_w < in_width; ++in_w) { T gradient = 0.0; - for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { + for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { const T* rois = rois_data + roi_idx * 8; T roi_x[4]; T roi_y[4]; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index b683b7573db747bde5f57e530ec53760db099843..c82930cc4994c3854e60f40ae9909a90d82cbff6 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -345,8 +345,8 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(platform::CPUPlace()); auto lod = rois->lod().back(); - for (int i = 0; i < lod.size() - 1; ++i) { - for (int j = lod[i]; j < lod[i + 1]; ++j) { + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { roi2image_data[j] = i; } } @@ -432,7 +432,7 @@ __global__ void RoiTransformGradKernel( T gradient = 0.0; // Accumulate gradient over all RoIs that interpolated this element - for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { + for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { const T* rois = rois_data + roi_idx * 8; T roi_x[4]; T roi_y[4]; diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be..dd5d138a1e979826d59c4731920379b030e3b492 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto ap_type = GetAPType(ctx.Attr("ap_type")); int class_num = ctx.Attr("class_num"); - auto label_lod = in_label->lod(); - auto detect_lod = in_detect->lod(); + auto& label_lod = in_label->lod(); + auto& detect_lod = in_detect->lod(); PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), @@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto labels = framework::EigenTensor::From(input_label); auto detect = framework::EigenTensor::From(input_detect); - auto label_lod = input_label.lod(); - auto detect_lod = input_detect.lod(); + auto& label_lod = input_label.lod(); + auto& detect_lod = input_detect.lod(); int batch_size = label_lod[0].size() - 1; - auto label_index = label_lod[0]; + auto& label_index = label_lod[0]; for (int n = 0; n < batch_size; ++n) { std::map> boxes; @@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel { output_true_pos->set_lod(true_pos_lod); output_false_pos->set_lod(false_pos_lod); - return; } void GetInputPos(const framework::Tensor& input_pos_count, @@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto SetData = [](const framework::LoDTensor& pos_tensor, std::map>>& pos) { const T* pos_data = pos_tensor.data(); - auto pos_data_lod = pos_tensor.lod()[0]; + auto& pos_data_lod = pos_tensor.lod()[0]; for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { T score = pos_data[j * 2]; @@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel { std::map>>* false_pos) const { int batch_size = gt_boxes.size(); for (int n = 0; n < batch_size; ++n) { - auto image_gt_boxes = gt_boxes[n]; - for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { + auto& image_gt_boxes = gt_boxes[n]; + for (auto& image_gt_box : image_gt_boxes) { size_t count = 0; - auto labeled_bboxes = it->second; + auto& labeled_bboxes = image_gt_box.second; if (evaluate_difficult) { count = labeled_bboxes.size(); } else { - for (size_t i = 0; i < labeled_bboxes.size(); ++i) - if (!(labeled_bboxes[i].is_difficult)) ++count; + for (auto& box : labeled_bboxes) { + if (!box.is_difficult) { + ++count; + } + } } if (count == 0) { continue; } - int label = it->first; + int label = image_gt_box.first; if (label_pos_count->find(label) == label_pos_count->end()) { (*label_pos_count)[label] = count; } else { diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 75a3662316462a222760bfbb7d7906c70f46d143..d8e9cee85bd734c2ed4b1cae03ecee04e304b651 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include // NOLINT #include // NOLINT diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3dbbd75b1e945208395c42ace3235db7891936c5..5be7095acd3c5ac6f880a8a26c246f60a93643b5 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -15,6 +15,7 @@ #pragma once #include +#include // NOLINT #include #include diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index d88e8c640ffb5ea44e88318cc973c9a783862435..f3e61e1575ced0b9ffbad23e6973121daca9751b 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include // NOLINT diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index a79b900b9801e6b80e4433a9acdd4dab6c34859d..94df11bee70dec44f19ee9ffff04ca92d5990ee8 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -89,7 +89,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") .SetDefault(false); AddComment(string::Sprintf(R"DOC( -Limited Elementwise %s Operator +Elementwise %s Operator The equation is: diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc index 9a297d03cfb041e584159a5fc5ba214f8ac404b4..3acae3bcdf4a509ab6e7e19f21c4b2ec4d72b7d7 100644 --- a/paddle/fluid/operators/extract_rows_op.cc +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { auto &in = scope.FindVar(Input("X"))->Get(); auto out = scope.FindVar(Output("Out"))->GetMutable(); - auto in_rows = in.rows(); + auto &in_rows = in.rows(); auto out_dim = framework::make_ddim( std::vector{static_cast(in_rows.size()), 1}); auto dst_ptr = out->mutable_data(out_dim, in.place()); diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b917a403620e2ffb2cbb4ca7856cce9584e1eef --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc @@ -0,0 +1,604 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_embedding_fc_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void FusedEmbeddingFCLSTMOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Embeddings"), + "Assert only one Input(Embeddings) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Assert only one Input(WeightH) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Assert only one Output(Hidden) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Assert only one Output(Cell) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("Embeddings"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto x_dims = ctx->GetInputDim("Ids"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(Ids)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto embeddings_dims = ctx->GetInputDim("Embeddings"); + PADDLE_ENFORCE_EQ(embeddings_dims.size(), 2, + "The rank of Input(Embeddings) should be 2."); + + auto wh_dims = ctx->GetInputDim("WeightH"); + int frame_size = wh_dims[1] / 4; + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, + "The second dimension of Input(WeightH) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + PADDLE_ENFORCE_EQ( + b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection or" + "4 * %d if disable peepholes", + frame_size, frame_size); + + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->ShareLoD("Ids", "Hidden"); + ctx->ShareLoD("Ids", "Cell"); + int xx_width; + if (ctx->Attrs().Get("use_seq")) { + xx_width = wh_dims[1]; + } else { + xx_width = x_dims[1] > wh_dims[1] ? wh_dims[1] : x_dims[1]; + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), + "Assert only one Output(BatchedInput) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), + "Assert only one Output(BatchedHidden) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), + "Assert only one Output(BatchedCell) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), + "Assert only one Output(ReorderedH0) of LSTM"); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), + "Assert only one Output(ReorderedC0) of LSTM."); + ctx->SetOutputDim("BatchedInput", {x_dims[0], wh_dims[1]}); + ctx->SetOutputDim("BatchedHidden", out_dims); + ctx->SetOutputDim("BatchedCell", out_dims); + } + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("Ids", "XX"); +} + +framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Embeddings")->type()), + ctx.device_context()); +} + +void FusedEmbeddingFCLSTMOpMaker::Make() { + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddInput("Embeddings", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate(); + AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr("use_seq", + "(bool, defalut: True) " + "whether to use seq mode to compute.") + .SetDefault(true); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. +)DOC"); +} + +template +class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { + public: +#define INIT_VEC_FUNC \ + std::function act_gate, act_cell, act_cand; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_cell_str = ctx.Attr("cell_activation"); \ + auto& act_cand_str = ctx.Attr("candidate_activation"); \ + if (platform::jit::MayIUse(platform::jit::avx)) { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } else { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } + +#define INIT_BASE_INPUT_OUTPUT \ + auto* ids = ctx.Input("Ids"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* embeddings = ctx.Input("Embeddings"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); + +#define INIT_BASE_SIZES \ + auto ids_dims = ids->dims(); /* T x M*/ \ + auto ids_numel = ids->numel(); /* T x 1*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + int64_t row_number = embeddings->dims()[0]; \ + int64_t row_width = embeddings->dims()[1]; \ + const int D4 = wh_dims[1]; + +#define INIT_BASE_INPUT_DATAS \ + const int64_t* ids_data = ids->data(); \ + const T* embeddings_data = embeddings->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wc_data = bias->data() + D4; \ + /* for peephole only*/ \ + Tensor checked_cell; \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + checked_cell_data = checked_cell.mutable_data({2, D}, place); \ + } + +/// Compute LSTM +#define GEMM_WH_ADDON(bs, prev, out) \ + blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ + wh_data, D4, static_cast(1), out, D4) + +// gates: W_ch, W_ih, W_fh, W_oh +#define GET_Ct(ct_1, gates, ct) \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + act_cand(D, gates, gates); \ + blas.VMUL(D, gates, gates + D, gates + D); \ + blas.VMUL(D, ct_1, gates + D2, gates + D2); \ + blas.VADD(D, gates + D, gates + D2, ct) + +#define GET_Ht(ct, gates, ht) \ + /* H_t = act_cell(C_t) * ogated */ \ + act_cell(D, ct, gates + D2); \ + blas.VMUL(D, gates + D2, gates + D3, ht) + +#define GET_Ct_NOH0C0(gates, ct) \ + /* C_t = igated * cgated*/ \ + act_gate(D, gates + D, gates + D); \ + act_cand(D, gates, gates); \ + blas.VMUL(D, gates, gates + D, ct) + +#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \ + GET_Ct_NOH0C0(gates, ct); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \ + GET_Ct_NOH0C0(gates, ct); \ + /* get outgated, put W_oc * C_t on igated */ \ + blas.VMUL(D, wc_data + D2, ct, gates + D); \ + blas.VADD(D, gates + D, gates + D3, gates + D3); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt(gates, ct_1, ct, ht) \ + act_gate(D3, gates + D, gates + D); \ + GET_Ct(ct_1, gates, ct); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ + /* get fgated and igated*/ \ + blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ + blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \ + blas.VADD(D2, checked_cell_data, gates + D, gates + D); \ + act_gate(D2, gates + D, gates + D); \ + GET_Ct(ct_1, gates, ct); \ + /* get ogated*/ \ + blas.VMUL(D, wc_data + D2, ct, gates + D); \ + blas.VADD(D, gates + D, gates + D3, gates + D3); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + + void SeqCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC + INIT_BASE_INPUT_DATAS + + // std::cout << "====> SeqCompute" << std::endl; + auto ids_lod = ids->lod(); + const int total_T = ids_dims[0]; + const int N = ids_lod[0].size() - 1; + const T* h0_data = h0 ? h0->data() : nullptr; + const T* c0_data = c0 ? c0->data() : nullptr; + T* xx_data = xx->mutable_data(place); + T* h_out_data = hidden_out->mutable_data(place); + T* c_out_data = cell_out->mutable_data(place); + auto blas = math::GetBlas(ctx); + + for (int64_t i = 0; i < ids_numel; ++i) { + PADDLE_ENFORCE_LT(ids_data[i], row_number); + PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, + row_width * sizeof(T)); + } + + int xx_offset = D4; + int gate_offset = D; + if (is_reverse) { + const int offset = (total_T - 1) * D; + xx_data = xx_data + offset * 4; + h_out_data = h_out_data + offset; + c_out_data = c_out_data + offset; + xx_offset = -D4; + gate_offset = -D; + } + +#define MOVE_ONE_STEP \ + prev_h_data = h_out_data; \ + prev_c_data = c_out_data; \ + xx_data = xx_data + xx_offset; \ + h_out_data = h_out_data + gate_offset; \ + c_out_data = c_out_data + gate_offset + +#define PROCESS_H0C0_DEFINES \ + int bid = is_reverse ? N - 1 - i : i; \ + int seq_len = ids_lod[0][bid + 1] - ids_lod[0][bid]; \ + const T* prev_c_data = nullptr; \ + const T* prev_h_data = nullptr; \ + int tstart = 0 + +#define PROCESS_H0C0_PEEPHOLE \ + PROCESS_H0C0_DEFINES; \ + if (h0_data) { \ + prev_h_data = h0_data + bid * D; \ + prev_c_data = c0_data + bid * D; \ + } else { \ + COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \ + MOVE_ONE_STEP; \ + tstart = 1; \ + } + +#define PROCESS_H0C0 \ + PROCESS_H0C0_DEFINES; \ + if (h0_data) { \ + prev_h_data = h0_data + bid * D; \ + prev_c_data = c0_data + bid * D; \ + } else { \ + COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \ + MOVE_ONE_STEP; \ + tstart = 1; \ + } + + if (use_peepholes) { + for (int i = 0; i < N; ++i) { + PROCESS_H0C0_PEEPHOLE + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); + MOVE_ONE_STEP; + } + } + } else { + for (int i = 0; i < N; ++i) { + PROCESS_H0C0 + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data); + MOVE_ONE_STEP; + } + } + } +#undef PROCESS_H0C0_DEFINES +#undef PROCESS_H0C0_PEEPHOLE +#undef PROCESS_H0C0 +#undef MOVE_ONE_STEP + } + + void BatchCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = platform::CPUDeviceContext; + INIT_BASE_INPUT_OUTPUT + if (ids->lod()[0].size() == 2) { + SeqCompute(ctx); + return; + } + INIT_BASE_SIZES + INIT_VEC_FUNC + INIT_BASE_INPUT_DATAS + + // std::cout << "===> Batch Compute" << std::endl; + + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* reordered_c0 = ctx.Output("ReorderedC0"); + auto* batched_input = ctx.Output("BatchedInput"); + auto* batched_c_out = ctx.Output("BatchedCell"); + auto* batched_h_out = ctx.Output("BatchedHidden"); + T* xx_data = xx->mutable_data(place); + T* batched_input_data = batched_input->mutable_data(place); + T* batched_c_out_data = batched_c_out->mutable_data(place); + T* batched_h_out_data = batched_h_out->mutable_data(place); + hidden_out->mutable_data(place); + cell_out->mutable_data(place); + + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + + for (int64_t i = 0; i < ids_numel; ++i) { + PADDLE_ENFORCE_LT(ids_data[i], row_number); + PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, + row_width * sizeof(T)); + } + + to_batch(dev_ctx, *xx, batched_input, true, is_reverse); + + auto batched_lod = batched_input->lod(); + const auto& seq_order = batched_lod[2]; + const int max_bs = seq_order.size(); + reordered_h0->Resize({max_bs, D}); + reordered_c0->Resize({max_bs, D}); + + int tstart = 0; + T* prev_h_data = nullptr; + T* prev_c_data = nullptr; + if (h0) { + // reorder h0, c0 + T* reordered_h0_data = reordered_h0->mutable_data(place); + T* reordered_c0_data = reordered_c0->mutable_data(place); + const T* h0_data = h0->data(); + const T* c0_data = c0->data(); + prev_h_data = reordered_h0_data; + prev_c_data = reordered_c0_data; + size_t sz = sizeof(T) * D; + for (int i = 0; i < max_bs; ++i) { + std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); + std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + reordered_h0_data += D; + reordered_c0_data += D; + } + } else { + // compute without h0, c0 + T* cur_in_data = batched_input_data; + T* cur_h_out_data = batched_h_out_data; + T* cur_c_out_data = batched_c_out_data; + for (int i = 0; i < max_bs; ++i) { + GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); + if (use_peepholes) { + blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D); + blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3); + } + act_gate(D, cur_in_data + D3, cur_in_data + D3); + GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data); + cur_in_data += D4; + cur_c_out_data += D; + cur_h_out_data += D; + } + tstart = 1; + prev_h_data = batched_h_out_data; + prev_c_data = batched_c_out_data; + } + const auto& batch_starts = batched_lod[0]; + const int max_seq_len = batch_starts.size() - 1; + const int offset = tstart * max_bs * D; + batched_input_data = batched_input_data + offset * 4; + batched_h_out_data = batched_h_out_data + offset; + batched_c_out_data = batched_c_out_data + offset; + +#define DEFINE_CUR \ + T* cur_in_data = batched_input_data; \ + T* cur_prev_c_data = prev_c_data; \ + T* cur_c_out_data = batched_c_out_data; \ + T* cur_h_out_data = batched_h_out_data + +#define MOVE_ONE_BATCH \ + cur_in_data += D4; \ + cur_prev_c_data += D; \ + cur_c_out_data += D; \ + cur_h_out_data += D + +#define MOVE_ONE_STEP \ + prev_c_data = batched_c_out_data; \ + prev_h_data = batched_h_out_data; \ + batched_c_out_data = cur_c_out_data; \ + batched_h_out_data = cur_h_out_data; \ + batched_input_data = cur_in_data + + if (use_peepholes) { + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + DEFINE_CUR; + for (int i = 0; i < cur_bs; ++i) { + COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data); + MOVE_ONE_BATCH; + } + MOVE_ONE_STEP; + } + } else { + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + DEFINE_CUR; + for (int i = 0; i < cur_bs; ++i) { + COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data); + MOVE_ONE_BATCH; + } + MOVE_ONE_STEP; + } + } +#undef MOVE_ONE_STEP +#undef MOVE_ONE_BATCH +#undef DEFINE_CUR + + math::Batch2LoDTensorFunctor to_seq; + batched_h_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_h_out, hidden_out); + batched_c_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_c_out, cell_out); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + if (ctx.Attr("use_seq")) { + SeqCompute(ctx); + } else { + BatchCompute(ctx); + } + } + +#undef COMPUTE_CtHt_PEEPHOLE +#undef COMPUTE_CtHt +#undef GET_Ct_NOH0C0 +#undef COMPUTE_CtHt_NOH0C0 +#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 +#undef GET_Ht +#undef GET_Ct +#undef GEMM_WH_ADDON +#undef INIT_BASE_INPUT_DATAS +#undef INIT_BASE_SIZES +#undef INIT_BASE_INPUT_OUTPUT +#undef INIT_VEC_FUNC +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMOp, + ops::FusedEmbeddingFCLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm, + ops::FusedEmbeddingFCLSTMKernel, + ops::FusedEmbeddingFCLSTMKernel); diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.h b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2775b2ac04d2890355fe6d75a1e2507a2668dc95 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedEmbeddingFCLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index 31e87d9113118ebe7a4b25ffee5ba55e2714fb66..a04c1c1263fba659e2d3f623b607e9f476bb40ed 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = paddle::platform::CPUDeviceContext; auto* x = ctx.Input("X"); + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES if (x->lod()[0].size() == 2) { + xx->Resize({total_T, D3}); SeqCompute(ctx); return; } - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES INIT_VEC_FUNC auto* reordered_h0 = ctx.Output("ReorderedH0"); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 8ca79d20ec4f6412b00dbf3990068f81b65e2efd..ae1f6d8e489039667d861a69acabf2c632ef2061 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims[0], 1, "The first dimension of Input(Bias) should be 1."); - PADDLE_ENFORCE_EQ( - b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, - "The second dimension of Input(Bias) should be " - "7 * %d if enable peepholes connection or" - "4 * %d if disable peepholes", - frame_size, frame_size); + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + ctx->SetOutputDim("CheckedCell", {2, frame_size}); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes", + frame_size); + } framework::DDim out_dims({x_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); @@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() { AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.") + .AsIntermediate(); AddAttr("use_peepholes", "(bool, defalut: True) " "whether to enable diagonal/peephole connections.") @@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D3 = D * 3; \ const int D4 = wh_dims[1]; -#define INIT_BASE_INPUT_DATAS \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wc_data = bias->data() + D4; \ - /* for peephole only*/ \ - Tensor checked_cell; \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - checked_cell_data = checked_cell.mutable_data({2, D}, place); \ +#define INIT_BASE_INPUT_DATAS \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wc_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ } /// Compute LSTM @@ -424,11 +432,12 @@ class FuisonLSTMKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = platform::CPUDeviceContext; INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES if (x->lod()[0].size() == 2) { + xx->Resize({x_dims[0], D4}); SeqCompute(ctx); return; } - INIT_BASE_SIZES INIT_VEC_FUNC INIT_BASE_INPUT_DATAS diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 74823dab09cac358f647c074ac2f2ee2fed17e55..abd5dce8f7e7146a1671a387328c177e5e6e0a85 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto gpu_place = boost::get(context.GetPlace()); // TODO(yuyang18): Strange code here. - memory::Copy(platform::CPUPlace(), - new_rows.CUDAMutableData(context.GetPlace()), gpu_place, - ids_data, ids_num * sizeof(int64_t), stream); - + memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc index 58e6512021203664573a0478dade052f92dd70bb..e96d1879331974e0873e13f171414bcfa8c45953 100644 --- a/paddle/fluid/operators/math/cpu_lstm_compute.cc +++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc @@ -13,6 +13,31 @@ limitations under the License. */ namespace paddle { namespace operators { -namespace math {} // namespace math +namespace math { +#ifdef __AVX__ +template <> +void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, + float* ht) { + namespace act = detail::forward::avx; + // gates: W_ch, W_ih, W_fh, W_oh + __m256 c, i, f, o; + c = _mm256_loadu_ps(gates); + i = _mm256_loadu_ps(gates + 8); + f = _mm256_loadu_ps(gates + 16); + o = _mm256_loadu_ps(gates + 24); + + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); + i = _mm256_loadu_ps(ct_1); + f = _mm256_mul_ps(i, act::Sigmoid(f)); + f = _mm256_add_ps(c, f); + _mm256_storeu_ps(ct, f); + + /* H_t = act_cell(C_t) * ogated */ + o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); + _mm256_storeu_ps(ht, o); +} +#endif +} // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h index 28b6f71729edf1b8cc5d610d76af78dea213313e..169a9e4b47f54851ad436428416eca879b78e186 100644 --- a/paddle/fluid/operators/math/cpu_lstm_compute.h +++ b/paddle/fluid/operators/math/cpu_lstm_compute.h @@ -48,32 +48,15 @@ namespace forward { namespace avx { __m256 Sigmoid(const __m256 a); __m256 Tanh(const __m256 a); + } // namespace avx } // namespace forward } // namespace detail template <> void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht) { - namespace act = detail::forward::avx; - // gates: W_ch, W_ih, W_fh, W_oh - __m256 c, i, f, o; - c = _mm256_loadu_ps(gates); - i = _mm256_loadu_ps(gates + 8); - f = _mm256_loadu_ps(gates + 16); - o = _mm256_loadu_ps(gates + 24); + float* ht); - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); - i = _mm256_loadu_ps(ct_1); - f = _mm256_mul_ps(i, act::Sigmoid(f)); - f = _mm256_add_ps(c, f); - _mm256_storeu_ps(ct, f); - - /* H_t = act_cell(C_t) * ogated */ - o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); - _mm256_storeu_ps(ht, o); -} #endif } // namespace math diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 027e2de48d229761f12f974dc73625c8ea1b3567..3be389912307f7aac6dda6d1018943eb8f08696d 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -20,149 +21,268 @@ namespace paddle { namespace operators { namespace math { +template +__inline__ __device__ T warpReduceSum(T val) { +#if CUDA_VERSION < 9000 + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down(val, offset); + return val; +#else +#define FULL_MASK 0xffffffff + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(FULL_MASK, val, offset); + return val; +#endif +} +__forceinline__ __device__ unsigned lane_id() { + unsigned ret; + asm volatile("mov.u32 %0, %laneid;" : "=r"(ret)); + return ret; +} + +__forceinline__ __device__ unsigned warp_id() { + unsigned ret; + asm volatile("mov.u32 %0, %warpid;" : "=r"(ret)); + return ret; +} + // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. template -__global__ void KernelDepthwiseConv( - const int nthreads, const T* const input_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, +__device__ __inline__ void KernelDepthwiseConv( + const T* const input_data, const T* const filter_data, const int batch_size, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, T* const output_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - - if (index < nthreads) { - const int batch = index / output_channels / output_height / output_width; - const int c_out = (index / output_height / output_width) % output_channels; - const int h_out = (index / output_width) % output_height; - const int w_out = index % output_width; - - const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; - T value = 0; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = h_in_start + filter_height; - const int w_in_end = w_in_start + filter_width; - - const int in_offset = - ((batch * input_channels + c_in) * input_height) * input_width; - - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - - for (int h_in = h_start; h_in < h_end; h_in++) { - for (int w_in = w_start; w_in < w_end; w_in++) { - const int offset = in_offset + h_in * input_width + w_in; - value += - weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * - input_data[offset]; + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const output_data) { + for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { + for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height * dilate_height; + const int w_in_end = w_in_start + filter_width * dilate_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + int weight_offset = 0; + + for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) { + for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { + if (h_in >= h_start && h_in < h_end && w_in >= w_start && + w_in < w_end) { + const int offset = in_offset + h_in * input_width + w_in; + value += weight[weight_offset] * input_data[offset]; + } + weight_offset++; + } } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; } - output_data[index] = value; } } +template +__global__ void KernelDepthwiseConvSp( + const T* const input_data, const T* const filter_data, const int batch_size, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const output_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, + input_height, input_width, filter_multiplier, + filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, + dilate_height, dilate_width, output_data); + + else + KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, + input_height, input_width, c_filter_multiplier, + filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, + dilate_width, output_data); +} + // CUDA kernel to compute the depthwise convolution backprop w.r.t input. template -__global__ void KernelDepthwiseConvInputGrad( - const int nthreads, const T* const output_grad_data, - const T* const filter_data, const int batch_size, const int output_channels, - const int output_height, const int output_width, const int input_channels, - const int input_height, const int input_width, const int filter_multiplier, - const int filter_height, const int filter_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* const input_grad_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int batch = index / input_channels / input_height / input_width; - const int c_in = (index / input_height / input_width) % input_channels; - const int h_in = (index / input_width) % input_height; - const int w_in = index % input_width; - - const int c_out_start = c_in * filter_multiplier; - - int h_out_start = - (h_in - filter_height + padding_height + stride_height) / stride_height; - h_out_start = 0 > h_out_start ? 0 : h_out_start; - - int h_out_end = (h_in + padding_height) / stride_height; - h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; - - int w_out_start = - (w_in - filter_width + padding_width + stride_width) / stride_width; - w_out_start = 0 > w_out_start ? 0 : w_out_start; - - int w_out_end = (w_in + padding_width) / stride_width; - w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end; - - T value = 0; - - for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; - c_out++) { - for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { - const int filter_h = h_in + padding_height - h_out * stride_height; - for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { - const int filter_w = w_in + padding_width - w_out * stride_width; - const int filter_offset = c_out * filter_height * filter_width + - filter_h * filter_width + filter_w; - const int output_grad_offset = - ((batch * output_channels + c_out) * output_height + h_out) * - output_width + - w_out; - value += - output_grad_data[output_grad_offset] * filter_data[filter_offset]; +__device__ __inline__ void KernelDepthwiseConvInputGrad( + const T* const output_grad_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const input_grad_data) { + for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { + for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + h_in - (filter_height - 1) * dilate_height + padding_height; + + int h_out_end = h_in + padding_height; + + int w_out_start = + w_in - (filter_width - 1) * dilate_width + padding_width; + + int w_out_end = w_in + padding_width; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + int filter_offset = (c_out + 1) * filter_height * filter_width; + for (int h_out = h_out_start; h_out <= h_out_end; + h_out += dilate_height) { + for (int w_out = w_out_start; w_out <= w_out_end; + w_out += dilate_width) { + filter_offset--; + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; + value += output_grad_data[output_grad_offset] * + filter_data[filter_offset]; + } + } } } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + input_grad_data[index] = value; } - input_grad_data[index] += value; } } +template +__global__ void KernelDepthwiseConvInputGradSp( + const T* const output_grad_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const input_grad_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConvInputGrad( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, input_grad_data); + else + KernelDepthwiseConvInputGrad( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + input_grad_data); +} + // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. template -__global__ void KernelDepthwiseConvFilterGrad( - const int nthreads, const T* const output_grad_data, - const T* const input_data, const int num, const int output_channels, - const int output_height, const int output_width, const int input_channels, - const int input_height, const int input_width, const int filter_multiplier, - const int filter_height, const int filter_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* const filter_grad_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int w_out = index % output_width; - const int h_out = (index / output_width) % output_height; - const int c_out = (index / output_width / output_height) % output_channels; - const int batch = (index / output_width / output_height / output_channels); - const int c_in = c_out / filter_multiplier; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = - -padding_height + h_out * stride_height + filter_height; - const int w_in_end = -padding_width + w_out * stride_width + filter_width; - const int in_offset = - (batch * input_channels + c_in) * input_height * input_width; - - T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - - for (int h_in = h_start; h_in < h_end; h_in++) { - for (int w_in = w_start; w_in < w_end; w_in++) { - const int offset = in_offset + h_in * input_width + w_in; - const T diff_temp = output_grad_data[index] * input_data[offset]; - T* addr = addr_offset + (h_in - h_in_start) * filter_width + - (w_in - w_in_start); - paddle::platform::CudaAtomicAdd(addr, diff_temp); +__device__ __inline__ void KernelDepthwiseConvFilterGrad( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + T s = 0; + + int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; + int lid = lane_id(); + + for (int image_w = threadIdx.x; image_w < output_width; + image_w += blockDim.x) { + for (int bid = 0; bid < num; bid++) { + for (int image_h = threadIdx.y; image_h < output_height; + image_h += blockDim.y) { + int kernel_id = blockIdx.z; + int kernel_h = blockIdx.y * dilate_height - padding_height; + int kernel_w = blockIdx.x * dilate_width - padding_width; + + int image_hk = image_h * stride_height + kernel_h; + int image_wk = image_w * stride_width + kernel_w; + if (image_hk < 0 || image_hk >= input_height) continue; + if (image_wk < 0 || image_wk >= input_width) continue; +#define gaid(N, C, H, W) \ + ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) + + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + input_data[((bid * (gridDim.z / filter_multiplier) + + kernel_id / filter_multiplier) * + input_height + + image_hk) * + input_width + + image_wk]; + +#undef gaid } } } +#if __CUDA_ARCH__ >= 530 + s = warpReduceSum(s); + if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); +#else + paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); +#endif +} + +template +__global__ void KernelDepthwiseConvFilterGradSp( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConvFilterGrad( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, filter_grad_data); + else + KernelDepthwiseConvFilterGrad( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, filter_grad_data); } /* @@ -177,7 +297,9 @@ class DepthwiseConvFunctor { const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output) { + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -191,22 +313,37 @@ class DepthwiseConvFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* filter_data = filter.data(); T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * output_height * output_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConv<<>>( - nthreads, input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - output_data); + int thread = 512; + int blocks = std::min(std::max(thread / output_width, 1), output_height); + dim3 threads(std::min(output_width, thread), blocks, 1); + dim3 grid(output_channels, batch_size, 1); + int filter_multiplier = output_channels / input_channels; +#define check_case(c_filter_multiplier, c_stride) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride) { \ + KernelDepthwiseConvSp<<>>( \ + input_data, filter_data, batch_size, output_channels, output_height, \ + output_width, input_channels, input_height, input_width, \ + filter_multiplier, ksize_height, ksize_width, stride_height, \ + stride_width, padding_height, padding_width, dilate_height, \ + dilate_width, output_data); \ + return; \ + } + check_case(1, 1); + check_case(1, 2); + // NOTE(liangdun): 0,0 for other case + // add other case if needed, e.g. check_case(2^n,1) + check_case(0, 0); +#undef check_case } }; @@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* filter_data = filter.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = batch_size * input_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConvInputGrad<<>>( - nthreads, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - input_grad_data); + int thread = 512; + int blocks = std::min(std::max(thread / input_width, 1), input_height); + dim3 threads(std::min(input_width, thread), blocks, 1); + dim3 grid(input_channels, batch_size, 1); + int filter_multiplier = output_channels / input_channels; + +#define check_case(c_filter_multiplier, c_stride) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride) { \ + KernelDepthwiseConvInputGradSp< \ + T, c_filter_multiplier, \ + c_stride><<>>( \ + output_grad_data, filter_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, input_grad_data); \ + return; \ + } + check_case(1, 1); + check_case(1, 2); + // NOTE(liangdun): 0,0 for other case + // add other case if needed, e.g. check_case(2^n,1) + check_case(0, 0); +#undef check_case } }; @@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* filter_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* output_grad_data = output_grad.data(); T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * output_height * output_width; - - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConvFilterGrad<<>>( - nthreads, output_grad_data, input_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - filter_grad_data); + int block_size = 512; + int crop_output_height = + std::min(std::max(block_size / output_width, 1), output_height); + dim3 grid(ksize_width, ksize_height, output_channels); + dim3 threads(std::min(output_width, block_size), crop_output_height, 1); + int filter_multiplier = output_channels / input_channels; + +#define check_case(c_filter_multiplier) \ + if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ + KernelDepthwiseConvFilterGradSp< \ + T, c_filter_multiplier><<>>( \ + output_grad_data, input_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, filter_grad_data); \ + return; \ + } + check_case(1); + check_case(0); +#undef check_case } }; diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h index 97aec401889a56d3fc9ac08e766d931bb3725b01..71f6fcb23df1942d6dcf7177165f2ec1022a9b35 100644 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -32,7 +32,8 @@ class DepthwiseConvFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output); + const std::vector& paddings, + const std::vector& dilations, framework::Tensor* output); }; template @@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* input_grad); }; @@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* filter_grad); }; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 5923792902a81521256de300f77955f1ea3d16c6..854c8653ff545cb12eef79837d0312bb28458af8 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -13,6 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index c63ad89e46d2c187c7e6fe6b2fe73fbbed5f4044..b4f19417b6eabf24805c5c8128c2a6d423ddac69 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -13,18 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" -#endif - -#ifdef PADDLE_USE_OPENBLAS -#include -// remove typedef in openblas -#undef FLOAT -#undef INT -#undef SIZE -#endif - #include #include diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index b27880c232a51d32777569cf9ac67656ce02f232..ba8eccf82042b679f69a32f9d053f05ac8fb9a99 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -60,11 +60,9 @@ struct SelectedRowsAdd { auto out_place = context.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(out_place)); - memory::Copy( - boost::get(out_place), out_data, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); auto* in2_data = in2_value.data(); memory::Copy(boost::get(out_place), @@ -148,7 +146,7 @@ struct SelectedRowsAddTo { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ(in1_height, input2->height()); - framework::Vector in1_rows(input1.rows()); + auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); auto& in1_value = input1.value(); diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 5341187d1ce9400ac34750ab691608e76158ae0d..56cef91e29cc7da27384c27a7ec63e90cfadfc3b 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims, dims2str(paddings) + pooling_type + suffix; } +static inline int ComputeCeiledOutput(int input_size, int kernel_size, + int padding, int stride) { + return (input_size - kernel_size + 2 * padding) / stride + 1; +} + +static inline void CorrectOutputSize( + const std::vector& src_tz, const std::vector& dst_tz, + const std::vector& kernel_size, const std::vector& paddings, + const std::vector& strides, + std::vector& right_bot_padding) { // NOLINT + for (size_t i = 0; i < right_bot_padding.size(); i++) { + int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i], + paddings[i], strides[i]); + if (desired_size != dst_tz[i + 2]) { + right_bot_padding[i] += strides[i]; + } + } +} + template class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { + const std::vector& padding_left_top(paddings); + std::vector padding_right_bottom(paddings); + bool ceil_mode = ctx.Attr("ceil_mode"); + if (ceil_mode) { + CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, + padding_right_bottom); + } auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), input_format); @@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::format::any); std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, - pooling_type, mkldnn_engine); + CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, + padding_right_bottom, ksize, pooling_type, + mkldnn_engine, ceil_mode); // save pool_pd into global device context to be referred in backward path dev_ctx.SetBlob(key_pool_pd, pool_pd); @@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { private: std::unique_ptr CreatePrimitiveDesc( const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, - const std::vector& stride, const std::vector& padding, - const std::vector& kernel, const std::string& pooling_type, - const mkldnn::engine& engine) const { + const std::vector& stride, const std::vector& padding_left_top, + const std::vector& padding_right_bot, const std::vector& kernel, + const std::string& pooling_type, const mkldnn::engine& engine, + bool ceil_mode) const { auto pool_desc = mkldnn::pooling_forward::desc( mkldnn::prop_kind::forward, pooling_type == "max" ? mkldnn::algorithm::pooling_max : mkldnn::algorithm::pooling_avg, - src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero); + src, dst, stride, kernel, padding_left_top, padding_right_bot, + mkldnn::padding_kind::zero); auto p_pool_pd = new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 960cb3235be7f4cc98b97d3b088ceaeb3d4a4209..59b30244839849d79e3e531953134633503c4090 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -12,17 +12,64 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_mean_op.h" -REGISTER_OP_CUDA_KERNEL(reduce_mean, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +namespace paddle { +namespace operators { + +template +struct DivideFunctor { + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } + + private: + T n_inv; +}; + +template +class ReduceMeanKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input, output, reduce_dims, static_cast(0), cub::Sum(), + DivideFunctor(reduce_num), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, + ops::ReduceMeanKernel, + ops::ReduceMeanKernel, + ops::ReduceMeanKernel); + REGISTER_OP_CUDA_KERNEL( reduce_mean_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f..53cd9e9419dd9aecee730917ae21d7a4ab332ffc 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -12,17 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_sum_op.h" -REGISTER_OP_CUDA_KERNEL(reduce_sum, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +namespace paddle { +namespace operators { + +template +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor() {} + + HOSTDEVICE inline T operator()(const T& x) const { return x; } +}; + +template +class ReduceSumKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input, output, reduce_dims, static_cast(0), cub::Sum(), + IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, + ops::ReduceSumKernel, ops::ReduceSumKernel, + ops::ReduceSumKernel); + REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index 724463c95c4a29fb5c00fe791b389d3908771640..a4f41a170426a4650fd3bf8f7fec4758ff34e1b9 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker { SamplingId Operator. A layer for sampling id from multinomial distribution from the input. Sampling one id for one sample.)DOC"); - AddAttr("min", "Minimum value of random. [default 0.0].") + AddAttr("min", "Minimum value of random. (float, default 0.0).") .SetDefault(0.0f); - AddAttr("max", "Maximun value of random. [default 1.0].") + AddAttr("max", "Maximun value of random. (float, default 1.0).") .SetDefault(1.0f); - AddAttr("seed", - "Random seed used for the random number engine. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time. [default 0].") + AddAttr( + "seed", + "Random seed used for the random number engine. " + "0 means use a seed generated by the system." + "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time. (int, default 0).") .SetDefault(0); } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index bf4df4f600c14050b636b7ee6d7b6973b57adb94..981969d2aaa684731a615ec64ca7f7718b35cf09 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference { auto out_var_name = op_desc.Output("Out").front(); auto *out_var = block->FindVarRecursive(out_var_name); - out_var->SetType(in_var.GetType()); - out_var->SetDataType(in_var.GetDataType()); + if (in_var_name != out_var_name) { + out_var->SetType(in_var.GetType()); + out_var->SetDataType(in_var.GetDataType()); + } } }; diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc deleted file mode 100644 index e71841d4d1815d50cd9800910c9db34e121beffc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/select_op.cc +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include // NOLINT -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" - -#include - -namespace paddle { -namespace operators { - -static constexpr char kX[] = "X"; -static constexpr char kCaseToExecute[] = "case_to_execute"; -static constexpr char kOutputs[] = "Out"; - -static constexpr char kCases[] = "cases"; -static constexpr char kCasesBlock[] = "sub_block"; - -class SelectOp : public framework::OperatorBase { - public: - SelectOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - enum class SelectOpCaseType { - DEFAULT = 0, - SEND = 1, - RECEIVE = 2, - }; - - struct SelectOpCase { - int caseIndex; - SelectOpCaseType caseType; - std::string channelName; - std::string varName; - - SelectOpCase() {} - - SelectOpCase(int caseIndex, SelectOpCaseType caseType, - std::string channelName, std::string varName) - : caseIndex(caseIndex), - caseType(caseType), - channelName(channelName), - varName(varName) {} - }; - - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - std::vector casesConfigs = - Attr>(kCases); - - framework::BlockDesc *casesBlock = - Attr(kCasesBlock); - - framework::Scope &casesBlockScope = scope.NewScope(); - - std::string caseToExecuteVarName = Input(kCaseToExecute); - framework::Variable *caseToExecuteVar = - casesBlockScope.FindVar(caseToExecuteVarName); - - // Construct cases from "conditional_block_op"(s) in the casesBlock - std::vector> cases = - ParseAndShuffleCases(&casesConfigs); - - // Get all unique channels involved in select - std::set channelsSet; - for (auto c : cases) { - if (!c->channelName.empty()) { - auto channelVar = scope.FindVar(c->channelName); - framework::ChannelHolder *ch = - channelVar->GetMutable(); - - if (channelsSet.find(ch) == channelsSet.end()) { - channelsSet.insert(ch); - } - } - } - - // Order all channels by their pointer address - std::vector channels(channelsSet.begin(), - channelsSet.end()); - std::sort(channels.begin(), channels.end()); - - // Poll all cases - int32_t caseToExecute = pollCases(&scope, &cases, channels); - - // At this point, the case to execute has already been determined, - // so we can proceed with executing the cases block - framework::LoDTensor *caseToExecuteTensor = - caseToExecuteVar->GetMutable(); - caseToExecuteTensor->data()[0] = caseToExecute; - - // Execute the cases block, only one case will be executed since we set the - // case_to_execute value to the index of the case we want to execute - framework::Executor executor(dev_place); - framework::ProgramDesc *program = casesBlock->Program(); - executor.Run(*program, &casesBlockScope, casesBlock->ID(), - false /*create_local_scope*/); - } - - /** - * Goes through all operators in the casesConfigs and processes - * "conditional_block" operators. These operators are mapped to our - * SelectOpCase objects. We randomize the case orders, and set the - * default case (if any exists) as the last case) - * @param casesBlock - * @return - */ - std::vector> ParseAndShuffleCases( - std::vector *casesConfigs) const { - std::vector> cases; - std::shared_ptr defaultCase; - - if (casesConfigs != nullptr) { - boost::char_delimiters_separator sep(false, ",", ""); - for (std::vector::iterator itr = casesConfigs->begin(); - itr < casesConfigs->end(); ++itr) { - std::string caseConfig = *itr; - boost::tokenizer<> tokens(caseConfig, sep); - - boost::tokenizer<>::iterator tok_iter = tokens.begin(); - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index"); - std::string caseIndexString = *tok_iter; - int caseIndex = std::stoi(caseIndexString); - - ++tok_iter; - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type"); - std::string caseTypeString = *tok_iter; - SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString); - - std::string caseChannel; - std::string caseChannelVar; - - ++tok_iter; - if (caseType != SelectOpCaseType::DEFAULT) { - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel"); - caseChannel = *tok_iter; - - ++tok_iter; - PADDLE_ENFORCE(tok_iter != tokens.end(), - "Cannot get case channel variable"); - caseChannelVar = *tok_iter; - } - - auto c = std::make_shared(caseIndex, caseType, - caseChannel, caseChannelVar); - - if (caseType == SelectOpCaseType::DEFAULT) { - PADDLE_ENFORCE(defaultCase == nullptr, - "Select can only contain one default case."); - defaultCase = c; - } else { - cases.push_back(c); - } - } - } - - // Randomly sort cases, with default case being last - std::random_shuffle(cases.begin(), cases.end()); - if (defaultCase != nullptr) { - cases.push_back(defaultCase); - } - - return cases; - } - - /** - * This method will recursively poll the cases and determines if any case - * condition is true. - * If none of the cases conditions are true (and there is no default case), - * then block - * the thread. The thread may be woken up by a channel operation, at which - * point we - * execute the case. - * @param scope - * @param cases - * @param channels - * @return - */ - int32_t pollCases(const framework::Scope *scope, - std::vector> *cases, - std::vector channels) const { - // Lock all involved channels - lockChannels(channels); - - std::atomic caseToExecute(-1); - - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - - switch (c->caseType) { - case SelectOpCaseType::SEND: - PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel"); - if (ch->CanSend()) { - // We can send to channel directly, send the data to channel - // and execute case - auto chVar = scope->FindVar(c->varName); - concurrency::ChannelSend(ch, chVar); - caseToExecute = c->caseIndex; - } - break; - case SelectOpCaseType::RECEIVE: - if (ch->CanReceive()) { - // We can receive from channel directly, send the data to channel - // and execute case - auto chVar = scope->FindVar(c->varName); - concurrency::ChannelReceive(ch, chVar); - caseToExecute = c->caseIndex; - } - break; - case SelectOpCaseType::DEFAULT: - caseToExecute = c->caseIndex; - break; - } - - if (caseToExecute != -1) { - // We found a case to execute, stop looking at other case statements - break; - } - - ++it; - } - - if (caseToExecute == -1) { - // None of the cases are eligible to execute, enqueue current thread - // into all the sending/receiving queue of each involved channel - std::atomic completed(false); - std::recursive_mutex mutex; - std::unique_lock lock{mutex}; - // std::condition_variable_any selectCond; - auto selectCond = std::make_shared(); - - std::recursive_mutex callbackMutex; - pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute, - &completed, &callbackMutex); - - // TODO(thuan): Atomically unlock all channels and sleep current thread - unlockChannels(channels); - selectCond->wait(lock, [&completed]() { return completed.load(); }); - - // Select has been woken up by case operation - lockChannels(channels); - removeThreadOnChannelQueues(scope, cases); - - if (caseToExecute == -1) { - // Recursively poll cases, since we were woken up by a channel close - // TODO(thuan): Need to test if this is a valid case - unlockChannels(channels); - return pollCases(scope, cases, channels); - } - } - - // At this point, caseToExecute != -1, and we can proceed with executing - // the case block - unlockChannels(channels); - - return caseToExecute; - } - - void lockChannels(std::vector chs) const { - std::vector::iterator it = chs.begin(); - while (it != chs.end()) { - framework::ChannelHolder *ch = *it; - ch->Lock(); - ++it; - } - } - - void unlockChannels(std::vector chs) const { - std::vector::reverse_iterator it = chs.rbegin(); - while (it != chs.rend()) { - framework::ChannelHolder *ch = *it; - ch->Unlock(); - ++it; - } - } - - void pushThreadOnChannelQueues( - const framework::Scope *scope, - std::vector> *cases, - std::shared_ptr rCond, - std::atomic *caseToExecute, std::atomic *completed, - std::recursive_mutex *callbackMutex) const { - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - - std::function cb = - [&caseToExecute, &completed, &callbackMutex, - c](framework::ChannelAction channelAction) { - std::lock_guard lock{*callbackMutex}; - - bool canProcess = false; - if (!(*completed)) { - // If the channel wasn't closed, we set the caseToExecute index - // as this current case - if (channelAction != framework::ChannelAction::CLOSE) { - *caseToExecute = c->caseIndex; - } - // This will allow our conditional variable to break out of wait - *completed = true; - canProcess = true; - } - - return canProcess; - }; - - switch (c->caseType) { - case SelectOpCaseType::SEND: { - auto chOutputVar = scope->FindVar(c->varName); - concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb); - break; - } - case SelectOpCaseType::RECEIVE: { - auto chOutputVar = scope->FindVar(c->varName); - concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb); - break; - } - default: - break; - } - ++it; - } - } - - void removeThreadOnChannelQueues( - const framework::Scope *scope, - std::vector> *cases) const { - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - switch (c->caseType) { - case SelectOpCaseType::SEND: { - ch->RemoveFromSendQ(this); - break; - } - case SelectOpCaseType::RECEIVE: { - ch->RemoveFromReceiveQ(this); - break; - } - default: - break; - } - ++it; - } - } -}; - -class SelectOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kX, - "A set of variables, which are required by operators inside the " - "cases of Select Op") - .AsDuplicable(); - AddInput(kCaseToExecute, - "(Int) The variable the sets the index of the case to execute, " - "after evaluating the channels being sent to and received from") - .AsDuplicable(); - AddOutput(kOutputs, - "A set of variables, which will be assigned with values " - "generated by the operators inside the cases of Select Op.") - .AsDuplicable(); - AddAttr>(kCases, - "(String vector) Serialized list of" - "all cases in the select op. Each" - "case is serialized as: " - "',,,'" - "where type is 0 for default, 1 for" - "send, and 2 for receive" - "No channel and values are needed for" - "default cases."); - AddAttr(kCasesBlock, - "The cases block inside select_op"); - AddComment(R"DOC( -)DOC"); - } -}; - -// TODO(thuan): Implement Gradient Operator for SELECT_OP - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(select, paddle::operators::SelectOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::SelectOpMaker); diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc index 1c86486157a02c3b78ed61e840fd8e452b9cb452..816ba123a6cbf84ec9b321d5d7cfef7fab9749b1 100644 --- a/paddle/fluid/operators/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_erase_op.cc @@ -24,9 +24,9 @@ class SequenceEraseOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SequenceEraseOp should not be null."); + "Input(X) of SequenceErase operator should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SequenceEraseOp should not be null."); + "Output(Out) of SequenceErase operator should not be null."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1, "Input(X) of SequenceEraseOp should be a 2-D LoDTensor " diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h index b5ea6ff49bbb29571f9a6ef6358ef881acd9be9e..03b59d71cc0ca2eddd1d9912e7ca25348507ba03 100644 --- a/paddle/fluid/operators/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_slice_op.h @@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel { } for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_LT(0, offset_data[i], + PADDLE_ENFORCE_LE(0, offset_data[i], "The offset[%d] must greater than zero.", i); PADDLE_ENFORCE_LT(0, length_data[i], "The length[%d] must greater than zero.", i); - PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i], lod[0][i + 1], "The target tensor's length overflow."); } diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 4722be7a666d3e8f3c25c9499f88ddda835f60e3..243609075713305a90dc162991166ba24d54e835 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU +#include #include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, } } -template +template __global__ void SparseSGDFunctorKernel(const T* selected_rows, const int64_t* rows, const T* learning_rate, T* tensor_out, - int64_t row_numel) { - const int ty = blockIdx.y; - int tid = threadIdx.x; - - selected_rows += ty * row_numel; - tensor_out += rows[ty] * row_numel; - - for (int index = tid; index < row_numel; index += block_size) { - // Since index in rows of SelectedRows can be duplicate, we have to use - // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd( - tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]); + int64_t row_numel, int64_t limit) { + for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) { + const T* selected_rows_ptr = selected_rows + i * row_numel; + T* tensor_out_ptr = tensor_out + rows[i] * row_numel; + for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. + paddle::platform::CudaAtomicAdd( + tensor_out_ptr + index, + -1.0 * learning_rate[0] * selected_rows_ptr[index]); + } } } } // namespace @@ -89,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in_height, out_dims[0]); auto& in_value = grad->value(); - framework::Vector in_rows(grad->rows()); + auto& in_rows = grad->rows(); int64_t in_row_numel = in_value.numel() / in_rows.size(); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); @@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel { auto* in_data = in_value.data(); auto* out_data = param_out->data(); - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<>>( + const int kThreadsPerBlock = 256; + int thread_x = kThreadsPerBlock; + int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + SparseSGDFunctorKernel<<>>( in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), - out_data, in_row_numel); + out_data, in_row_numel, in_rows.size()); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 6dffe527c1072ee97fcde1725bfc1a47ed1ad74a..34403c7a7aa717cca470be2931009e219e00e3ae 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto in_vars = context.MultiInputVar("X"); - int N = in_vars.size(); + size_t in_num = in_vars.size(); auto out_var = context.OutputVar("Out"); bool in_place = out_var == in_vars[0]; @@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); // If in_place, just skip the first tensor - for (int i = in_place ? 1 : 0; i < N; i++) { + for (size_t i = in_place ? 1 : 0; i < in_num; i++) { if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); if (in_t.numel() == 0) { @@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel { // Runtime InferShape size_t first_dim = 0; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); first_dim += sel_row.rows().size(); } std::vector in_dim; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); if (sel_row.rows().size() > 0) { in_dim = framework::vectorize(sel_row.value().dims()); @@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel { } if (in_dim.empty()) { VLOG(3) << "WARNING: all the inputs are empty"; - in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim = + framework::vectorize(get_selected_row(in_num - 1).value().dims()); } else { in_dim[0] = static_cast(first_dim); } out_value->Resize(framework::make_ddim(in_dim)); out_value->mutable_data(context.GetPlace()); - // if all the input sparse vars are empty, no need to // merge these vars. if (first_dim == 0UL) { @@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel { math::SelectedRowsAddTo functor; int64_t offset = 0; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); if (sel_row.rows().size() == 0) { continue; diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 1048d3017140c9e31426a1580b2862667116a024..41a5786fe8c3295390144732221280e152d0a15a 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -22,8 +22,6 @@ namespace paddle { DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); -DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size"); -DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size"); namespace operators { @@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("engine_uniq_key", "unique key for the TRT engine."); + AddAttr("max_batch_size", "the maximum batch size."); + AddAttr("workspace_size", "the workspace size."); AddComment("TensorRT engine operator."); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 79e75ea9a035b654f0bb7026d3a491bebe0b23c4..d4ba0f9c33c91811647f9d19a332f139c16b0eb2 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -28,8 +28,6 @@ namespace paddle { DECLARE_int32(tensorrt_engine_batch_size); -DECLARE_int32(tensorrt_max_batch_size); -DECLARE_int32(tensorrt_workspace_size); namespace operators { @@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto engine_name = context.Attr("engine_uniq_key"); + int max_batch_size = context.Attr("max_batch_size"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); } auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, - FLAGS_tensorrt_max_batch_size); + PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size); std::vector output_maps = context.Attr>("output_name_mapping"); @@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel { // Get the ProgramDesc and pass to convert. framework::proto::BlockDesc block_desc; block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch = FLAGS_tensorrt_max_batch_size; - auto max_workspace = FLAGS_tensorrt_workspace_size; + int max_batch_size = context.Attr("max_batch_size"); + int workspace_size = context.Attr("workspace_size"); + auto params = context.Attr>("parameters"); std::unordered_set parameters; for (const auto& param : params) { @@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel { // TODO(Superjomn) replace this with a different stream auto* engine = Singleton::Global().Create( - max_batch, max_workspace, nullptr /*engine hold its own stream*/, + max_batch_size, workspace_size, nullptr /*engine hold its own stream*/, context.Attr("engine_uniq_key"), boost::get(context.GetPlace()).device); diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 27c1d29762b3de5e57f877b271aae52e71eb7cf9..e21101e8d12f210af08284dbcebe5c14c1af6dd3 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, using inference::analysis::SetAttr; TEST(TensorRTEngineOp, manual) { - FLAGS_tensorrt_engine_batch_size = 2; - FLAGS_tensorrt_max_batch_size = 2; framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); + SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); @@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) { } void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { - FLAGS_tensorrt_engine_batch_size = batch_size; - FLAGS_tensorrt_max_batch_size = batch_size; framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; @@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch", batch_size); - SetAttr(engine_op_desc.Proto(), "max_workspace", 2 << 10); + SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); SetAttr>( engine_op_desc.Proto(), "parameters", std::vector({"y0", "y1", "y2", "y3"})); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 9da8551eb2d7ea66ad434c42b54522432095ce29..8e4a07556fb51dbb15ef948fcee120e2f68e089a 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -256,36 +256,65 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, * 3. go to the second setp, until one thread's topk value is null; * 4. go to the first setp, until get the topk value. */ + template __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, - const T* src, int lds, int dim, int k) { + const T* src, int lds, int dim, int k, + int grid_dim, int num) { __shared__ Pair sh_topk[BlockSize]; __shared__ int maxid[BlockSize / 2]; const int tid = threadIdx.x; const int warp = threadIdx.x / 32; - output += blockIdx.x * output_stride; - indices += blockIdx.x * k; - Pair topk[MaxLength]; - int beam = MaxLength; - Pair max; - bool is_empty = false; - bool firststep = true; + const int bid = blockIdx.x; + for (int i = bid; i < num; i += grid_dim) { + output += i * output_stride; + indices += i * k; + + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int k = 0; k < MaxLength; k++) { + topk[k].set(-INFINITY, -1); + } + while (k) { + ThreadGetTopK( + topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid); - for (int k = 0; k < MaxLength; k++) { - topk[k].set(-INFINITY, -1); + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &output, + &indices, &beam, &k, tid, warp); + } } - while (k) { - ThreadGetTopK(topk, &beam, k, - src + blockIdx.x * lds, &firststep, - &is_empty, &max, dim, tid); - - sh_topk[tid] = topk[0]; - BlockReduce(sh_topk, maxid, topk, &output, - &indices, &beam, &k, tid, warp); +} + +inline static int GetDesiredBlockDim(int dim) { + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; } } +#define FIXED_BLOCK_DIM_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kBlockDim = (dim); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM(...) \ + FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) + template class TopkOpCUDAKernel : public framework::OpKernel { public: @@ -310,18 +339,26 @@ class TopkOpCUDAKernel : public framework::OpKernel { // NOTE: pass lds and dim same to input width. // NOTE: old matrix implementation of stride is different to eigen. // TODO(typhoonzero): refine this kernel. - dim3 threads(256, 1); - dim3 grid(input_height, 1); - - KeMatrixTopK<<< - grid, threads, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>( - output_data, output->dims()[1], indices_data, input_data, input_width, - input_width, static_cast(k)); + const int kMaxHeight = 2048; + int gridx = input_height < kMaxHeight ? input_height : kMaxHeight; + auto& dev_ctx = ctx.cuda_device_context(); + + switch (GetDesiredBlockDim(input_width)) { + FIXED_BLOCK_DIM( + KeMatrixTopK<<>>( + output_data, output->dims()[1], indices_data, input_data, + input_width, input_width, static_cast(k), gridx, + input_height)); + default: + PADDLE_THROW("Error"); + } } }; +#undef FIXED_BLOCK_DIM_BASE +#undef FIXED_BLOCK_DIM + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 16eac1ec2406c147fa765bc014038ae03a1416b2..3c8a01b6e47459760b05b5ca7fa4fa5e1d37d112 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -224,10 +224,12 @@ class WhileGradOp : public framework::OperatorBase { if (cur_scope_iter == step_scopes->rbegin()) { auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name); - PADDLE_ENFORCE(var->IsType() || - var->IsType(), - "Currently the type of var only can be LoDTensorArray " - "or LoDTensor."); + PADDLE_ENFORCE( + var->IsType() || + var->IsType(), + "Currently the type of var only can be LoDTensorArray, " + "or LoDTensor, but the received var[%s] is %s.", + inside_grad_name, var->Type().name()); if (var->IsType()) { auto &inside_tensor = var->Get(); diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index c7c533bd42859c374c4783d43ec4cdd34a6a994a..4ea0cd7283b55649dbdbbf97f81f10c69ac6a1d2 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -55,7 +55,7 @@ extern void *cublas_dso_handle; struct DynLoad__##__name { \ template \ inline cublasStatus_t operator()(Args... args) { \ - return __name(args...); \ + return ::__name(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 0103e7a3accf88f3c83f109298010c3c9af3d549..e6353f67ef118072a2d8e49111e8ecc486589998 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL +#include #include #include // NOLINT @@ -47,13 +50,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #else -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - return __name(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cudnnStatus_t operator()(Args... args) { \ + return ::__name(args...); \ + } \ + }; \ extern DynLoad__##__name __name #endif diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 2daf1b4215ce1f7f771bbac72bfe103b0b941976..0bb300ec33076d9ddfaf69190f14131279cc888e 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -44,7 +44,7 @@ extern void *curand_dso_handle; struct DynLoad__##__name { \ template \ curandStatus_t operator()(Args... args) { \ - return __name(args...); \ + return ::__name(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 6a3ad2151081504fda2a3818c5f99ad47039d91d..cc5cda6106c188f3156d33480b5d3641eed32556 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -107,7 +107,11 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, const std::string& dso_name, bool throw_on_error = true) { +#if !defined(_WIN32) int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 void* dso_handle = nullptr; std::string dlPath = dso_name; @@ -117,10 +121,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, // search xxx.so from custom path dlPath = join(search_root, dso_name); dso_handle = dlopen(dlPath.c_str(), dynload_flags); +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 // if not found, search from default path if (nullptr == dso_handle) { LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" - << dlerror() << ")"; + << errorno << ")"; if (dlPath.find("nccl") != std::string::npos) { std::cout << "You may need to install 'nccl2' from NVIDIA official website: " @@ -139,10 +148,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " "using the DYLD_LIBRARY_PATH is impossible unless System " "Integrity Protection (SIP) is disabled."; +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 if (throw_on_error) { - PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror()); + PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); } else if (nullptr == dso_handle) { - LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); } return dso_handle; diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 61a653d9313daff96d39c08e80f17d7e33acceb1..f04395a8ac00f33501008aa12f22773ddda9b138 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -21,6 +21,7 @@ limitations under the License. */ #if defined(_WIN32) #define NOMINMAX // msvc max/min macro conflict with std::min/max #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#define GOOGLE_GLOG_DLL_DECL #endif #ifdef PADDLE_WITH_CUDA @@ -47,7 +48,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" -#if !defined(__APPLE__) and !defined(_WIN32) +#if !defined(__APPLE__) && !defined(_WIN32) #include "paddle/fluid/platform/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA @@ -216,7 +217,7 @@ inline typename std::enable_if::type throw_on_error( #endif } -#if !defined(__APPLE__) and !defined(_WIN32) +#if !defined(__APPLE__) && !defined(_WIN32) template inline typename std::enable_if::type throw_on_error( ncclResult_t stat, const Args&... args) { @@ -260,14 +261,8 @@ inline void throw_on_error(T e) { } \ } while (false) -#define PADDLE_THROW_EOF() \ - do { \ - throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ - __LINE__); \ - } while (false) - #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__) +#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #else // !_WIN32 @@ -281,6 +276,12 @@ inline void throw_on_error(T e) { #define PADDLE_ENFORCE(x, ...) x #endif // !_WIN32 +#define PADDLE_THROW_EOF() \ + do { \ + throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ + __LINE__); \ + } while (false) + /* * Some enforce helpers here, usage: * int a = 1; @@ -294,7 +295,7 @@ inline void throw_on_error(T e) { * extra messages is also supported, for example: * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) */ - +#if !defined(_WIN32) #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) #define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ @@ -307,6 +308,7 @@ inline void throw_on_error(T e) { __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) + #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ do { \ if (UNLIKELY(nullptr == (__VAL))) { \ @@ -326,6 +328,27 @@ inline void throw_on_error(T e) { paddle::string::Sprintf("" __VA_ARGS__)); \ } \ } while (0) +#else +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1)) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1)) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1)) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1)) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1)) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1)) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + do { \ + if (!((__VAL0)__CMP(__VAL1))) { \ + PADDLE_THROW("Windows disable the enforce. Enforce failed."); \ + } \ + } while (0) +#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \ + do { \ + if (nullptr == (__VAL1)) { \ + PADDLE_THROW("Windows disable the enforce. Enforce failed"); \ + } \ + } while (0) +#endif // !_WIN32 } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 126636d879213b1c8f242db8fbdf6a358a1d2da9..f599e7fbc886a60394ae4690e4160275b55b8596 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -20,8 +20,11 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, - "Default use 92% of GPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); + "Allocate a trunk of gpu memory that is this fraction of the " + "total gpu memory size. Future memory usage will be allocated " + "from the trunk. If the trunk doesn't have enough gpu memory, " + "additional trunks of the same size will be requested from gpu " + "until the gpu has no memory left for another trunk."); namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b5bd07d401f9ebfe441bc0f84f9bad317f0e8da9..e7f634c4a622b48e97040987836406cf73cb23b6 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 67501186d150171728194f23bc02d2c014848dd7..3b22718a8c6f994dbc2dc3e7aaa19a7163f716ba 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -214,7 +214,6 @@ void BindVarDsec(pybind11::module *m) { .def("set_shapes", &pd::VarDesc::SetShapes) .def("set_dtype", &pd::VarDesc::SetDataType) .def("set_dtypes", &pd::VarDesc::SetDataTypes) - .def("set_capacity", &pd::VarDesc::SetCapacity) .def("shape", &pd::VarDesc::GetShape, pybind11::return_value_policy::reference) .def("shapes", &pd::VarDesc::GetShapes, @@ -251,7 +250,6 @@ void BindVarDsec(pybind11::module *m) { .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES) .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE) .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) - .value("CHANNEL", pd::proto::VarType::CHANNEL) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST) .value("READER", pd::proto::VarType::READER) .value("RAW", pd::proto::VarType::RAW); @@ -285,12 +283,12 @@ void BindOpDesc(pybind11::module *m) { .def("set_output", &pd::OpDesc::SetOutput) .def("input_arg_names", &pd::OpDesc::InputArgumentNames) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames) - .def("rename_input", &pd::OpDesc::RenameInput) - .def("rename_output", &pd::OpDesc::RenameOutput) + .def("_rename_input", &pd::OpDesc::RenameInput) + .def("_rename_output", &pd::OpDesc::RenameOutput) .def("has_attr", &pd::OpDesc::HasAttr) .def("attr_type", &pd::OpDesc::GetAttrType) .def("attr_names", &pd::OpDesc::AttrNames) - .def("set_attr", &pd::OpDesc::SetAttr) + .def("_set_attr", &pd::OpDesc::SetAttr) .def("attr", &pd::OpDesc::GetAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) @@ -300,8 +298,8 @@ void BindOpDesc(pybind11::module *m) { std::string ser(seriralized); self.SetAttr(name, ser); }) - .def("block_attr_id", &pd::OpDesc::GetBlockAttrId) - .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) + .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId) + .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) .def("check_attrs", &pd::OpDesc::CheckAttrs) .def("infer_shape", &pd::OpDesc::InferShape) .def("infer_var_type", &pd::OpDesc::InferVarType) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 8b62502e3f920a3bf7d80f9e7edc2f3647a0e5b1..295af1c5837d70c32b522cc47c8c3e12d5bd61c7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,10 +21,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -595,6 +595,29 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); + py::class_> pass(m, "Pass"); + pass.def(py::init()) + .def("set_str", [](ir::Pass &self, const std::string &name, + const std::string &attr) { + self.Set(name, new std::string(attr)); + }); + + py::class_> pb( + m, "PassBuilder"); + pb.def(py::init()) + .def("append_pass", + [](ir::PassBuilder &self, + const std::string &pass_type) -> std::shared_ptr { + return self.AppendPass(pass_type); + }) + .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) + .def("insert_pass", + [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { + return self.InsertPass(idx, pass_type); + }) + .def("remove_pass", + [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); + // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy"); @@ -677,7 +700,11 @@ All parameter, weight, gradient are variables in Paddle. }, [](BuildStrategy &self, bool b) { self.fuse_elewise_add_act_ops_ = b; - }); + }) + .def("_create_passes_from_strategy", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(); + }); pe.def(py::init &, const std::unordered_set &, diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index a3b4e38f453835828a4a53130e11c854ac3f4a74..10c9eb80d0a7e07d5974ca10d740e71e7717b5c5 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -56,13 +56,13 @@ struct Style { }; template -static void PrettyLogEndl(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLogEndl(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; } template -static void PrettyLog(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLog(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset(); } diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fae28fcb4c3102240438b62c203c65281f029192 --- /dev/null +++ b/paddle/fluid/train/CMakeLists.txt @@ -0,0 +1,29 @@ +function(train_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs ARGS) + cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(arg_list "") + if(train_test_ARGS) + foreach(arg ${train_test_ARGS}) + list(APPEND arg_list "_${arg}") + endforeach() + else() + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_train_${TARGET_NAME}${arg} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_origin + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES DEPENDS test_${TARGET_NAME}) + endforeach() +endfunction(train_test) + + +if(WITH_TESTING) + train_test(recognize_digits ARGS mlp conv) +endif() diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc new file mode 100644 index 0000000000000000000000000000000000000000..e8731dd51ad698e53b7f10cc781c52134f2d17a8 --- /dev/null +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/place.h" + +DEFINE_string(dirname, "", "Directory of the train model."); + +namespace paddle { + +void Train() { + CHECK(!FLAGS_dirname.empty()); + framework::InitDevices(false); + const auto cpu_place = platform::CPUPlace(); + framework::Executor executor(cpu_place); + framework::Scope scope; + + auto train_program = inference::Load( + &executor, &scope, FLAGS_dirname + "__model_combined__.main_program", + FLAGS_dirname + "__params_combined__"); + + std::string loss_name = ""; + for (auto op_desc : train_program->Block(0).AllOps()) { + if (op_desc->Type() == "mean") { + loss_name = op_desc->Output("Out")[0]; + break; + } + } + + PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); + + // prepare data + auto x_var = scope.Var("img"); + auto x_tensor = x_var->GetMutable(); + x_tensor->Resize({64, 1, 28, 28}); + + auto x_data = x_tensor->mutable_data(cpu_place); + for (int i = 0; i < 64 * 28 * 28; ++i) { + x_data[i] = 1.0; + } + + auto y_var = scope.Var("label"); + auto y_tensor = y_var->GetMutable(); + y_tensor->Resize({64, 1}); + auto y_data = y_tensor->mutable_data(cpu_place); + for (int i = 0; i < 64 * 1; ++i) { + y_data[i] = static_cast(1); + } + + auto loss_var = scope.Var(loss_name); + float first_loss = 0.0; + float last_loss = 0.0; + for (int i = 0; i < 100; ++i) { + executor.Run(*train_program.get(), &scope, 0, false, true); + if (i == 0) { + first_loss = loss_var->Get().data()[0]; + } else if (i == 99) { + last_loss = loss_var->Get().data()[0]; + } + } + EXPECT_LT(last_loss, first_loss); +} + +TEST(train, recognize_digits) { Train(); } + +} // namespace paddle diff --git a/paddle/legacy/trainer/tests/CMakeLists.txt b/paddle/legacy/trainer/tests/CMakeLists.txt index 08548bea4c4a7fc4fa99d9305208abd4ee442572..fbefcced5643b65372072856bfeb6c87cd4071a8 100644 --- a/paddle/legacy/trainer/tests/CMakeLists.txt +++ b/paddle/legacy/trainer/tests/CMakeLists.txt @@ -16,7 +16,11 @@ endfunction() trainer_test(test_Compare) trainer_test(test_PyDataProviderWrapper) trainer_test(test_recurrent_machine_generation) -trainer_test(test_Trainer) +if(NOT APPLE) + trainer_test(test_Trainer) +else() + message(WARNING "These tests has been disabled in OSX for random fail: \n test_Trainer") +endif() ############### test_TrainerOnePass ########################## if(WITH_PYTHON) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index f50a68c54114d5cce15418ad22f38c83163ba866..d9214d0b8cef948dfca2ac9b1e3597fefc990786 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -70,8 +70,8 @@ function cmake_gen() { PYTHON_FLAGS="" SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then + echo "Using python abi: $1" if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then - echo "using python abi: $1" if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 @@ -82,7 +82,18 @@ function cmake_gen() { else exit 1 fi - # TODO: qiyang add python3 part here + elif [ "$1" == "cp35-cp35m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ "$1" != "" ]; then @@ -147,6 +158,7 @@ function cmake_gen() { -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DPY_VERSION=${PY_VERSION:-2.7} + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -178,7 +190,8 @@ EOF -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DPY_VERSION=${PY_VERSION:-2.7} + -DPY_VERSION=${PY_VERSION:-2.7} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} } @@ -361,7 +374,7 @@ EOF ctest --output-on-failure # make install should also be test when unittest make install -j `nproc` - pip install /usr/local/opt/paddle/share/wheels/*.whl + pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi @@ -379,13 +392,14 @@ function run_mac_test() { EOF # TODO: jiabin need to refine this part when these tests fixed on mac - ctest --output-on-failure -j8 + ctest --output-on-failure -j $1 # make install should also be test when unittest make install -j 8 - pip install /usr/local/opt/paddle/share/wheels/*.whl + pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi + pip uninstall -y paddlepaddle fi } @@ -627,10 +641,10 @@ EOF function gen_capi_package() { if [[ ${WITH_C_API} == "ON" ]]; then - install_prefix="${PADDLE_ROOT}/build/capi_output" - rm -rf $install_prefix - make DESTDIR="$install_prefix" install - cd $install_prefix/usr/local + capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output + rm -rf $capi_install_prefix + make DESTDIR="$capi_install_prefix" install + cd $capi_install_prefix/ ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz fi } @@ -641,11 +655,21 @@ function gen_fluid_inference_lib() { if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then cat <>> import paddle.fluid as fluid + >>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic( + >>> fluid.default_main_program()) + >>> for op_type, op_num in uni_op_freq: + >>> print("%s \t %d" % (op_type, op_num)) + >>> for op_type, op_num in adj_2_op_freq: + >>> print("%s \t %d" % (op_type, op_num)) + + """ + + if not isinstance(program, Program): + raise TypeError("The input type should be Porgram." + "But you passed in %s" % (type(program))) + + uni_op_freq = OrderedDict() + adj_2_op_freq = OrderedDict() + op_in_ops = OrderedDict() + + parameters = [p.name for p in program.blocks[0].all_parameters()] + + # get uni_op_freq + for op in program.global_block().ops: + had_recorded = False + for var_name in op.output_arg_names: + if var_name in parameters: + continue + if not had_recorded and uni_op_freq.has_key(op.type): + uni_op_freq[op.type] += 1 + had_recorded = True + elif not had_recorded: + uni_op_freq[op.type] = 1 + had_recorded = True + + # get adj_2_op_freq + var_gen_op = {} + for op in program.global_block().ops: + for var_name in op.input_arg_names: + if var_name in parameters: + continue + if var_gen_op.has_key(var_name): + assert len(var_gen_op[var_name]) > 0 + if op_in_ops.has_key(op.type): + op_in_ops[op.type].append(var_gen_op[var_name][-1]) + else: + op_in_ops[op.type] = [var_gen_op[var_name][-1]] + else: + print("Var's generate op is not found,%s, %s" % + (var_name, op.type)) + + for var_name in op.output_arg_names: + if var_gen_op.has_key(var_name): + var_gen_op[var_name].append(op.type) + else: + var_gen_op[var_name] = [op.type] + + for op, in_ops in op_in_ops.iteritems(): + for in_op in in_ops: + op_op = in_op + "->" + op + if adj_2_op_freq.has_key(op_op): + adj_2_op_freq[op_op] += 1 + else: + adj_2_op_freq[op_op] = 1 + + uni_op_freq = sorted( + uni_op_freq.items(), key=lambda item: item[1], reverse=True) + adj_2_op_freq = sorted( + adj_2_op_freq.items(), key=lambda item: item[1], reverse=True) + + return uni_op_freq, adj_2_op_freq diff --git a/python/paddle/fluid/contrib/quantize/__init__.py b/python/paddle/fluid/contrib/quantize/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..14c208d0e7f35ebfbbe1c36d0b11a8d0f0efb4a6 --- /dev/null +++ b/python/paddle/fluid/contrib/quantize/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import quantize_transpiler +from .quantize_transpiler import * + +__all__ = quantize_transpiler.__all__ diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..032d0353ea6d80c4356ea9a9886ea59c48feec7a --- /dev/null +++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py @@ -0,0 +1,557 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numpy as np + +from paddle.fluid.framework import default_main_program, default_startup_program, program_guard +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid import unique_name +from paddle.fluid import core +from paddle.fluid.initializer import Constant +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layers.nn import autoincreased_step_counter +from paddle.fluid.framework import Variable +from paddle.fluid.executor import global_scope +from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler + +__all__ = ['QuantizeTranspiler'] + +_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul'] + + +def _quantized_var_name(var_name): + """ + Return quantized variable name for the input `var_name`. + """ + return "%s.quantized" % (var_name) + + +def _dequantized_var_name(var_name): + """ + Return dequantized variable name for the input `var_name`. + """ + return "%s.dequantized" % (var_name) + + +def _quantized_scale_name(var_name): + """ + Return quantized variable name for the input `var_name`. + """ + return "%s.scale" % (var_name) + + +def _original_var_name(var_name): + """ + Return the original variable name. + """ + if var_name.endswith('.quantized.dequantized'): + return var_name[:-len('.quantized.dequantized')] + if var_name.endswith('.quantized'): + return var_name[:-len('.quantized')] + if var_name.endswith('.dequantized'): + return var_name[:-len('.dequantized')] + if var_name.endswith('.scale'): + return var_name[:-len('.scale')] + else: + return var_name + + +def _is_float(v): + return isinstance(v, float) or isinstance(v, np.float32) + + +def quant(x, scale, num_bits): + y = np.round(x / scale * ((1 << (num_bits - 1)) - 1)) + return y + + +class QuantizeTranspiler(object): + def __init__(self, + weight_bits=8, + activation_bits=8, + activation_quantize_type='abs_max', + weight_quantize_type='abs_max', + window_size=10000): + """ + Convert and rewrite the fluid Program according to weight and + activation quantization type. + + Args: + weight_bits (int): quantization bit number for weights, + the bias is not quantized. + activation_bits (int): quantization bit number for activation. + activation_quantize_type (str): quantization type for activation, + now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode, + the quantization scale will be calculated dynamically each step + in both training and testing period. If use 'range_abs_max', + a static quantization scale will be calculated during training + and used in inference. + weight_quantize_type (str): quantization type for weights, + support 'abs_max'. The 'range_abs_max' usually is not used for + weight, since weights are fixed once the model is well trained. + window_size (int): the window size for 'range_abs_max' quantization. + + Examples: + + .. code-block:: python + + # the original program will be rewrite, if you don't want to + # change it, please clone at first. + # quantize_program = program.clone() + t = fluid.QuantizeTranspiler() + t.transpile(quantize_program) + + """ + self.weight_bits = weight_bits + self.activation_bits = activation_bits + quant_type = ['abs_max', 'range_abs_max'] + if weight_quantize_type not in quant_type: + raise ValueError( + "Unknown weight_quantize_type: '%s'. It can only be ", + "'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) + if activation_quantize_type not in quant_type: + raise ValueError( + "Unknown activation_quantize_type : '%s'. It can only be ", + "'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) + + self.weight_quantize_type = weight_quantize_type + self.activation_quantize_type = activation_quantize_type + + self.window_size = window_size + self.helper = LayerHelper(self.__class__.__name__) + self.fake_quant_op_types = [ + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + ] + self.fake_dequant_op_types = ['fake_dequantize_max_abs'] + self.is_test = None + self.global_step = None + + def training_transpile(self, program=None, startup_program=None): + """Rewrites a training input program in place for simulated + quantization. Insert fake quantization and de-quantization ops into + program to simulate the error introduced by quantization. And change + the graident ops' input by using the faked quantization weights and + activation. Since the program is transformed in place, the graph + connection will change. + + Args: + program (Program): the input program to be transpile. + """ + self.is_test = False + program = default_main_program() if program is None else program + startup_program = default_startup_program() if startup_program is \ + None else startup_program + + # marked the variable which has been quantized and dequantized. + dequanted_vars = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES] + + params = [p.name for p in program.global_block().iter_parameters()] + + def _transpile_forward(block, op): + idx = block.ops.index(op) + block_id = block.idx + # insert quant op and dequant op + for name in op.input_arg_names: + if name in dequanted_vars[block_id]: + dequant_var = dequanted_vars[block_id][name] + else: + var = block.var(name) + quant_bits = self.weight_bits if var.name in params \ + else self.activation_bits + quant_type = self.weight_quantize_type if var.name \ + in params else self.activation_quantize_type + + quant_var, scale_var = self._insert_quant_op( + block, idx, var, quant_bits, quant_type) + dequant_var = self._insert_dequant_op( + block, idx + 1, quant_var, scale_var, quant_bits) + dequanted_vars[block_id][name] = dequant_var + # rename the forward op inputs + op._rename_input(name, dequant_var.name) + + def _transpile_backward(block, op): + block_id = block.idx + no_dequanted_input_vars = True + for name in op.input_arg_names: + if name in dequanted_vars[block_id]: + dequant_var = dequanted_vars[block_id][name] + op._rename_input(name, dequant_var.name) + no_dequanted_input_vars = False + if no_dequanted_input_vars: + raise ValueError("There is no dequanted inputs for op %s." % + (op.type)) + + with program_guard(program, startup_program): + self._create_global_step() + for block in program.blocks: + ops = list(block.ops) + block_id = block.idx + for op in ops: + # rewrite the forward ProgramDes + if op.type in _QUANTIZABLE_OP_TYPES: + _transpile_forward(block, op) + # rename the backward op inputs + if op.type in grad_op_types: + _transpile_backward(block, op) + + def _create_global_step(self): + if self.weight_quantize_type == 'range_abs_max' or \ + self.activation_quantize_type == 'range_abs_max': + self.global_step = autoincreased_step_counter() + + def freeze_program(self, program, place, fuse_bn=False, scope=None): + """Freeze input training program for inference. + + Args: + program (Program): the input program to be transpile. + """ + + self.is_test = True + scope = global_scope() if scope is None else scope + program = default_main_program() if program is None else program + + if fuse_bn: + bn_fuse_transpiler = BNFuseTranspiler() + bn_fuse_transpiler.transpile(program, place) + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, program.list_vars()) + ] + op_in_rename_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + op_out_rename_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + var_scale_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + + def _remove_fake_quant_and_dequant_op(block, op): + idx = block.ops.index(op) + block_id = block.idx + k = op.output('Out')[0] + v = op.input('X')[0] + if v not in op_in_rename_map[block_id]: + op_in_rename_map[block_id][k] = v + else: + op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v] + block._remove_op(idx) + + def _insert_post_dequant_op(block, op): + idx = block.ops.index(op) + block_id = block.idx + max_range = None + scale_var = None + for name in op.input_arg_names: + if name in op_in_rename_map[block_id]: + op._rename_input(name, op_in_rename_map[block_id][name]) + + scale_v = var_scale_map[block_id][_original_var_name(name)] + if _original_var_name(name) in persistable_vars: + param_range = (1 << (self.weight_bits - 1)) - 1 + act_range = (1 << (self.activation_bits - 1)) - 1 + assert _is_float(scale_v) + max_range = param_range * act_range / scale_v + else: + assert isinstance(scale_v, Variable) + scale_var = var_scale_map[block_id][_original_var_name( + name)] + + if len(op.output_arg_names) != 1: + raise ValueError("Only support one output, but op %s has" + " more than one output." % (op.type)) + out_var = block.var(op.output_arg_names[0]) + dequant_var = block.create_var( + name=_dequantized_var_name(out_var.name), + type=out_var.type, + shape=out_var.shape, + dtype=out_var.dtype) + # insert fake_dequantize_op + dequant_op = block._insert_op( + idx + 1, + type="fake_dequantize_max_abs", + attrs={'max_range': float(max_range)}, + inputs={"X": out_var, + 'Scale': scale_var}, + outputs={"Out": dequant_var}) + op_out_rename_map[block_id][out_var.name] = dequant_var.name + return dequant_var + + def _load_var(name): + return np.array(scope.find_var(name).get_tensor()) + + def _restore_var(name, arr): + t = scope.find_var(name).get_tensor() + t.set(arr, place) + + for block in program.blocks: + ops = list(block.ops) + block_id = block.idx + for op in ops: + op_type = op.type + + # insert dequant_op after fc/conv, need to rename + # input of the followed ops + for name in op.input_arg_names: + if name in op_out_rename_map[block_id]: + op._rename_input(name, + op_out_rename_map[block_id][name]) + + if op_type in self.fake_quant_op_types: + in_arg_name = op.input('X')[0] + if in_arg_name in persistable_vars: + if self.weight_quantize_type == 'abs_max': + param = _load_var(in_arg_name) + scale_v = np.max(np.abs(param)) + else: + scale_v = _load_var(op.output('OutScale')[0]) + var_scale_map[block_id][in_arg_name] = scale_v + else: + scale_v = block.var(op.output('OutScale')[0]) + var_scale_map[block_id][in_arg_name] = scale_v + + if in_arg_name in persistable_vars: + _remove_fake_quant_and_dequant_op(block, op) + # quantize weight and restore + param_t = _load_var(in_arg_name) + param_q_t = quant(param_t, scale_v, self.weight_bits) + _restore_var(in_arg_name, param_q_t) + + if op_type in self.fake_dequant_op_types: + _remove_fake_quant_and_dequant_op(block, op) + + if op_type in _QUANTIZABLE_OP_TYPES: + dequant_var = _insert_post_dequant_op(block, op) + + # remove the unused var in ProgramDesc + self._remove_unused_var(program) + #program = program.clone() + + def convert_to_int8(self, program, place, scope=None): + scope = global_scope() if scope is None else scope + program = default_main_program() if program is None else program + + def _load_var(name): + return np.array(scope.find_var(name).get_tensor()) + + global_block = program.global_block() + + def convert_to_int8(var): + int8_var_name = var.name + ".int8" + int8_var = global_block.create_parameter( + name=int8_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.INT8, + shape=var.shape) + + tensor = _load_var(var.name) + + scope.var(int8_var_name) + int8_tensor = scope.find_var(int8_var_name).get_tensor() + int8_tensor.set(tensor.astype(np.int8), place) + return int8_var + + input_map = {} + for block in program.blocks: + for op in list(block.ops): + if op.type in _QUANTIZABLE_OP_TYPES: + for name in op.input_arg_names: + var = block.var(name) + if var.persistable: + if name not in input_map: + int8_var = convert_to_int8(var) + input_map[name] = int8_var.name + op._rename_input(name, input_map[name]) + self._remove_unused_var(program) + + def _remove_unused_var(self, program): + all_remove_vars = [] + for block in program.blocks: + args = [] + for op in block.ops: + args += op.input_arg_names + args += op.output_arg_names + args = list(set(args)) + var_names = block.vars.keys() + sub_block_remove_vars = [] + for var in var_names: + if var not in args: + sub_block_remove_vars.append(var) + all_remove_vars.append(sub_block_remove_vars) + + remove_vars = [list(set(v)) for v in all_remove_vars] + for i, block in enumerate(program.blocks): + for v in remove_vars[i]: + block._remove_var(v) + + def _insert_quant_abs_max_op(self, block, idx, var, quant_bits): + """Insert fake_quantize_abs_max op. + """ + quant_var = block.create_var( + name=_quantized_var_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + scale = block.create_var( + name=_quantized_scale_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + quant_op = block._insert_op( + idx, + type='fake_quantize_abs_max', + attrs={'bit_length': quant_bits}, + inputs={'X': var}, + outputs={'Out': quant_var, + 'OutScale': scale}) + return quant_var, scale + + def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits): + """Insert fake_quantize_range_abs_max + """ + quant_var = block.create_var( + name=_quantized_var_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + scale = self.helper.create_parameter( + attr=ParamAttr( + name=_quantized_scale_name(var.name), + initializer=Constant(0.001), + trainable=False), + shape=[1], + dtype=var.dtype) + scale.stop_gradient = True + + ins = {'X': var, 'InScale': scale} + outs = {'Out': quant_var, 'OutScale': scale} + if not self.is_test: + # A global step counter variable with type int64 + scales = self.helper.create_global_variable( + name=unique_name.generate('scales'), + persistable=True, + dtype=var.dtype, + shape=[self.window_size]) + self.helper.set_variable_initializer( + scales, initializer=Constant(value=0)) + + ins['Iter'] = self.global_step + outs['OutScales'] = scales + + attrs = { + 'window_size': self.window_size, + 'bit_length': quant_bits, + 'is_test': self.is_test + } + + quant_op = block._insert_op( + idx, + type='fake_quantize_range_abs_max', + attrs=attrs, + inputs=ins, + outputs=outs) + + return quant_var, scale + + def _insert_quant_op(self, block, idx, var, quant_bits, quant_type): + """ + Insert fake_quantize_op + """ + if quant_type == 'abs_max': + return self._insert_quant_abs_max_op(block, idx, var, quant_bits) + elif quant_type == 'range_abs_max': + return self._insert_quant_range_abs_max_op(block, idx, var, + quant_bits) + + def _insert_dequant_op(self, block, idx, var, scale, quant_bits): + """ + Insert fake_quantize_op + """ + dequant_var = block.create_var( + name=_dequantized_var_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + # insert fake_dequantize_op + max_range = (1 << (quant_bits - 1)) - 1 + dequant_op = block._insert_op( + idx, + type="fake_dequantize_max_abs", + attrs={'max_range': float(max_range)}, + inputs={"X": var, + 'Scale': scale}, + outputs={"Out": dequant_var}) + return dequant_var + + +class BNFuseTranspiler(InferenceTranspiler): + def _fuse_param(self, current_op, bn_op, bias_op, with_bias): + def _update_param(op, param_name, new_param): + var = self.block.vars[param_name] + tensor = self.scope.find_var(param_name).get_tensor() + tensor.set(np.array(new_param), self.place) + + def _load_param(param_name): + return np.array(self.scope.find_var(param_name).get_tensor()) + + bias_bn = _load_param(bn_op.input("Bias")[0]) #Bias + scale_bn = _load_param(bn_op.input("Scale")[0]) #Scale + mean_bn = _load_param(bn_op.input("Mean")[0]) #Mean + var_bn = _load_param(bn_op.input("Variance")[0]) #Variance + + if current_op.type in ['conv2d', 'depthwise_conv2d']: + current_param = _load_param( + _original_var_name(current_op.input("Filter")[0])) + elif current_op.type == 'mul': + current_param = _load_param( + _original_var_name(current_op.input("Y")[0])) + + std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5))) + tmp = np.float32(np.divide(scale_bn, std_bn)) + + # add bias of batch_norm_op to conv2d + if with_bias: + bias = _load_param(bias_op.input("Y")) + else: + bias = np.zeros(bias_bn.shape) + bias = np.float32( + np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn)) + + # re-compute weight of conv2d/fc + tmp = tmp.reshape(tmp.shape[0], -1) + dst_param = current_param.reshape((tmp.shape[0], -1)) + dst_param = np.float32(np.multiply(dst_param, tmp)) + dst_param = dst_param.reshape(current_param.shape) + + # update parameters + if current_op.type in ['conv2d', 'depthwise_conv2d']: + _update_param(current_op, + _original_var_name(current_op.input("Filter")[0]), + dst_param) + elif current_op.type == 'mul': + _update_param(current_op, + _original_var_name(current_op.input("Y")[0]), + dst_param) + + _update_param(bias_op, bias_op.input("Y")[0], bias) + + # collect the renamed input + self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0] diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..79bec8c4ad34d682895250bc29b1fddb3a569bd4 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..86fa84ad4bd7a55fb27f4e43128f0bfda6dfe6db --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py @@ -0,0 +1,279 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +import numpy as np +import six + +import unittest +import paddle +import paddle.fluid as fluid +from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name +from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler + + +def linear_fc(num): + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def residual_block(num): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) + short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) + hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + loss = fluid.layers.cross_entropy(input=fc, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def conv_net(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + return avg_loss + + +class TestQuantizeTranspiler(unittest.TestCase): + def setUp(self): + # since quant_op and dequant_op is not ready, use cos and sin for test + self.weight_quant_op_type = 'fake_quantize_abs_max' + self.dequant_op_type = 'fake_dequantize_max_abs' + self.quantizable_op_and_inputs = { + 'conv2d': ['Input', 'Filter'], + 'depthwise_conv2d': ['Input', 'Filter'], + 'mul': ['X', 'Y'] + } + self.quantizable_op_grad_and_inputs = { + 'conv2d_grad': ['Input', 'Filter'], + 'depthwise_conv2d_grad': ['Input', 'Filter'], + 'mul_grad': ['X', 'Y'] + } + + def check_program(self, program): + quantized_ops = {} + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, program.list_vars()) + ] + + for block in program.blocks: + for idx, op in enumerate(block.ops): + # check forward + if op.type in self.quantizable_op_and_inputs: + for i, arg_name in enumerate(op.input_arg_names): + quant_op_type = self.weight_quant_op_type if \ + _original_var_name(arg_name) \ + in persistable_vars else self.act_quant_op_type + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + if arg_name not in quantized_ops: + self.assertEqual(block.ops[idx - 2 * i - 1].type, + self.dequant_op_type) + self.assertEqual(block.ops[idx - 2 * i - 2].type, + quant_op_type) + quantized_ops[arg_name] = block.ops[idx - 2 * i - 2] + else: + op_idx = block.ops.index(quantized_ops[arg_name]) + self.assertLess(op_idx, idx) + + # check backward + if op.type in self.quantizable_op_grad_and_inputs: + for pname in self.quantizable_op_grad_and_inputs[op.type]: + arg_name = op.input(pname)[0] + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + self.assertTrue(arg_name in quantized_ops) + + def linear_fc_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + t = QuantizeTranspiler(activation_quantize_type=quant_type) + t.training_transpile(main) + self.check_program(main) + + def test_linear_fc_quant_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.linear_fc_quant('abs_max') + + def test_linear_fc_quant_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.linear_fc_quant('range_abs_max') + + def residual_block_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = residual_block(2) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + t = QuantizeTranspiler(activation_quantize_type=quant_type) + t.training_transpile(main) + self.check_program(main) + + def test_residual_block_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.residual_block_quant('abs_max') + + def test_residual_block_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.residual_block_quant('range_abs_max') + + def freeze_program(self, use_cuda, seed): + def build_program(main, startup, is_test): + main.random_seed = seed + startup.random_seed = seed + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + img = fluid.layers.data( + name='image', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + loss = conv_net(img, label) + if not is_test: + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + return [img, label], loss + + main = fluid.Program() + startup = fluid.Program() + test_program = fluid.Program() + + import random + random.seed(0) + np.random.seed(0) + + feeds, loss = build_program(main, startup, False) + build_program(test_program, startup, True) + test_program = test_program.clone(for_test=True) + + quant_transpiler = QuantizeTranspiler() + quant_transpiler.training_transpile(main) + quant_transpiler.training_transpile(test_program) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + iters = 5 + batch_size = 8 + class_num = 10 + exe.run(startup) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + feeder = fluid.DataFeeder(feed_list=feeds, place=place) + + with fluid.program_guard(main): + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(program=main, + feed=feeder.feed(data), + fetch_list=[loss]) + + with fluid.program_guard(test_program): + test_data = next(test_reader()) + w_var = fluid.framework._get_var('conv2d_1.w_0.quantized', + test_program) + # Testing during training + test_loss1, w_quant = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[loss, w_var]) + + # Freeze program for inference, but the weight of fc/conv is still float type. + quant_transpiler.freeze_program(test_program, place) + test_loss2, = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[loss]) + self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3) + w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0') + .get_tensor()) + # fail: -432.0 != -433.0, this is due to the calculation precision + #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant)) + + # Convert parameter to 8-bit. + quant_transpiler.convert_to_int8(test_program, place) + # Save the 8-bit parameter and model file. + fluid.io.save_inference_model('model_8bit', ['image', 'label'], + [loss], exe, test_program) + # Test whether the 8-bit parameter and model file can be loaded successfully. + [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit', + exe) + # Check the loaded 8-bit weight. + w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8') + .get_tensor()) + + self.assertEqual(w_8bit.dtype, np.int8) + self.assertEqual(np.sum(w_8bit), np.sum(w_freeze)) + + def not_test_freeze_program_cuda(self): + if fluid.core.is_compiled_with_cuda(): + with fluid.unique_name.guard(): + self.freeze_program(True, seed=1) + + def not_test_freeze_program_cpu(self): + with fluid.unique_name.guard(): + self.freeze_program(False, seed=2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d44a49226883df19aae1a5520fc7763272d5dc95..e5135744aebf38785be14c00ccafd7438332b86b 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -37,12 +37,9 @@ from . import unique_name __all__ = [ 'Program', - 'Operator', - 'Parameter', 'default_startup_program', 'default_main_program', 'program_guard', - 'get_var', 'name_scope', ] @@ -540,8 +537,7 @@ class Operator(object): 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', - 'ncclInit', 'channel_create', 'channel_close', 'channel_send', - 'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id' + 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' } def __init__(self, @@ -655,11 +651,11 @@ class Operator(object): self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() - if self.has_kernel(type): + if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) - def has_kernel(self, op_type): + def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET def set_input(self, name, value): @@ -703,7 +699,7 @@ class Operator(object): """ return self.desc.input(name) - def rename_input(self, old_name, new_name): + def _rename_input(self, old_name, new_name): """ Rename the `old_name` to `new_name`. @@ -714,9 +710,9 @@ class Operator(object): Returns: None """ - self.desc.rename_input(old_name, new_name) + self.desc._rename_input(old_name, new_name) - def rename_output(self, old_name, new_name): + def _rename_output(self, old_name, new_name): """ Rename the `old_name` to `new_name`. @@ -727,7 +723,7 @@ class Operator(object): Returns: None """ - self.desc.rename_output(old_name, new_name) + self.desc._rename_output(old_name, new_name) @property def input_names(self): @@ -791,7 +787,7 @@ class Operator(object): """ return self.desc.attr_type(name) - def set_attr(self, name, val): + def _set_attr(self, name, val): """ Set the value of attribute by attribute's name. @@ -824,7 +820,7 @@ class Operator(object): isinstance(val, core.ProgramDesc): self.desc.set_serialized_attr(name, val.serialize_to_string()) else: - self.desc.set_attr(name, val) + self.desc._set_attr(name, val) @property def attr_names(self): @@ -843,7 +839,7 @@ class Operator(object): """ return self.desc.attr(name) - def block_attr_id(self, name): + def _block_attr_id(self, name): """ Get the block attribute's id by name. @@ -853,9 +849,9 @@ class Operator(object): Returns: int: the block index. """ - return self.desc.block_attr_id(name) + return self.desc._block_attr_id(name) - def block_attr(self, name): + def _block_attr(self, name): """ Get the block attribute by name. @@ -866,11 +862,11 @@ class Operator(object): block: the block attribute. """ - id = self.block_attr_id(name) + id = self._block_attr_id(name) assert (id >= 0 and id < len(self.block.program.blocks)) return self.block.program.blocks[id] - def blocks_attr(self, name): + def _blocks_attr(self, name): """ Get the blocks attribute by name. @@ -881,13 +877,13 @@ class Operator(object): list: list of the blocks attribute. """ attrs = [] - for i in self.blocks_attr_ids(name): + for i in self._blocks_attr_ids(name): assert (i >= 0 and i < len(self.block.program.blocks)) attrs.append(self.block.program.blocks[i]) return attrs - def blocks_attr_ids(self, name): + def _blocks_attr_ids(self, name): """ Get the blocks attribute's ids by name. @@ -898,7 +894,7 @@ class Operator(object): list: list of the blocks ids. """ - return self.desc.blocks_attr_ids(name) + return self.desc._blocks_attr_ids(name) def all_attrs(self): """ @@ -912,11 +908,11 @@ class Operator(object): for n in attr_names: attr_type = self.desc.attr_type(n) if attr_type == core.AttrType.BLOCK: - attr_map[n] = self.block_attr(n) + attr_map[n] = self._block_attr(n) continue if attr_type == core.AttrType.BLOCKS: - attr_map[n] = self.blocks_attr(n) + attr_map[n] = self._blocks_attr(n) continue attr_map[n] = self.attr(n) @@ -1790,7 +1786,7 @@ class Program(object): for j in six.moves.range(block.op_size()): op = block.op(j) if op.has_attr('is_test'): - op.set_attr('is_test', True) + op._set_attr('is_test', True) res.blocks = [ Block(res, i) for i in six.moves.range(res.desc.num_blocks()) ] @@ -2164,7 +2160,7 @@ def program_guard(main_program, startup_program=None): switch_startup_program(startup_program) -def get_var(name, program=None): +def _get_var(name, program=None): """ Get a variable by name from the global block of a program. diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index e703e5ac7943b006741f12886a14bf344a6b9b28..604f3eacd75beff306915b224b30c369dd3a486f 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -600,7 +600,7 @@ def save_inference_model(dirname, """ if isinstance(feeded_var_names, six.string_types): feeded_var_names = [feeded_var_names] - else: + elif export_for_deployment: if len(feeded_var_names) > 0: # TODO(paddle-dev): polish these code blocks if not (bool(feeded_var_names) and all( @@ -610,61 +610,60 @@ def save_inference_model(dirname, if isinstance(target_vars, Variable): target_vars = [target_vars] - else: + elif export_for_deployment: if not (bool(target_vars) and all( isinstance(var, Variable) for var in target_vars)): raise ValueError("'target_vars' should be a list of Variable.") if main_program is None: main_program = default_main_program() - copy_program = main_program.clone() + + # if there is lookup table, the trainer 0 will notify all pserver to save. + if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: + lookup_table_filename = os.path.join(dirname, "__lookup_table__") + _save_lookup_tables_by_notify(executor, lookup_table_filename, + main_program._distributed_lookup_table, + main_program._endpoints) if not os.path.isdir(dirname): os.makedirs(dirname) + if model_filename is not None: + model_basename = os.path.basename(model_filename) + else: + model_basename = "__model__" + model_basename = os.path.join(dirname, model_basename) # When export_for_deployment is true, we modify the program online so that # it can only be loaded for inference directly. If it's false, the whole # original program and related meta are saved so that future usage can be # more flexible. if export_for_deployment: - global_block = copy_program.global_block() + main_program = main_program.clone() + global_block = main_program.global_block() for i, op in enumerate(global_block.ops): op.desc.set_is_target(False) if op.type == "feed" or op.type == "fetch": global_block._remove_op(i) - copy_program.desc.flush() + main_program.desc.flush() - pruned_program = copy_program._prune(targets=target_vars) - saved_program = pruned_program._inference_optimize(prune_read_op=True) + main_program = main_program._prune(targets=target_vars) + main_program = main_program._inference_optimize(prune_read_op=True) fetch_var_names = [v.name for v in target_vars] - prepend_feed_ops(saved_program, feeded_var_names) - append_fetch_ops(saved_program, fetch_var_names) + prepend_feed_ops(main_program, feeded_var_names) + append_fetch_ops(main_program, fetch_var_names) + + with open(model_basename, "wb") as f: + f.write(main_program.desc.serialize_to_string()) else: # TODO(panyx0718): Save more information so that it can also be used # for training and more flexible post-processing. - saved_program = copy_program - - if model_filename is not None: - model_filename = os.path.basename(model_filename) - else: - model_filename = "__model__" - model_filename = os.path.join(dirname, model_filename) + with open(model_basename + ".main_program", "wb") as f: + f.write(main_program.desc.serialize_to_string()) if params_filename is not None: params_filename = os.path.basename(params_filename) - - with open(model_filename, "wb") as f: - f.write(saved_program.desc.serialize_to_string()) - - save_persistables(executor, dirname, saved_program, params_filename) - - # if there is lookup table, the trainer 0 will notify all pserver to save. - if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: - lookup_table_filename = os.path.join(dirname, "__lookup_table__") - _save_lookup_tables_by_notify(executor, lookup_table_filename, - main_program._distributed_lookup_table, - main_program._endpoints) + save_persistables(executor, dirname, main_program, params_filename) def load_inference_model(dirname, diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 0049773bbeb514d5dfef490e73b9988bd5371029..4af97e8632a47fbd981362dc8249a3f6b7269ecd 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -21,7 +21,7 @@ from .. import core from ..framework import Program, Variable, Operator from ..layer_helper import LayerHelper, unique_name from ..initializer import force_init_on_cpu -from .ops import logical_and, logical_not, logical_or +from .nn import logical_and, logical_not, logical_or import numpy import warnings import six @@ -1570,6 +1570,10 @@ class DynamicRNN(object): The dynamic RNN can mark multiple variables as its output. Use `drnn()` to get the output sequence. + + NOTES: + Currently it is not supported that setting is_sparse to True of any + layers within DynamicRNN. """ BEFORE_RNN = 0 IN_RNN = 1 diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 574d0d727cba9fa9de0cffbe116f71b9e65a7092..1cfcbbb9c1614f21848e62cce79befc673e1739c 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -42,19 +42,11 @@ __all__ = [ 'roi_perspective_transform', 'generate_proposal_labels', 'generate_proposals', -] - -__auto__ = [ 'iou_similarity', 'box_coder', 'polygon_box_transform', ] -__all__ += __auto__ - -for _OP in set(__auto__): - globals()[_OP] = generate_layer_fn(_OP) - def rpn_target_assign(bbox_pred, cls_logits, @@ -284,7 +276,7 @@ def detection_output(loc, target_box=loc, code_type='decode_center_size') compile_shape = scores.shape - run_shape = ops.shape(scores) + run_shape = nn.shape(scores) scores = nn.flatten(x=scores, axis=2) scores = nn.softmax(input=scores) scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape) @@ -308,6 +300,101 @@ def detection_output(loc, return nmsed_outs +@templatedoc() +def iou_similarity(x, y, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + + Returns: + out(${out_type}): ${out_comment} + """ + helper = LayerHelper("iou_similarity", **locals()) + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="iou_similarity", + inputs={"X": x, + "Y": y}, + attrs={}, + outputs={"Out": out}) + return out + + +@templatedoc() +def box_coder(prior_box, + prior_box_var, + target_box, + code_type="encode_center_size", + box_normalized=True, + name=None): + """ + ${comment} + + Args: + prior_box(${prior_box_type}): ${prior_box_comment} + prior_box_var(${prior_box_var_type}): ${prior_box_var_comment} + target_box(${target_box_type}): ${target_box_comment} + code_type(${code_type_type}): ${code_type_comment} + box_normalized(${box_normalized_type}): ${box_normalized_comment} + + Returns: + output_box(${output_box_type}): ${output_box_comment} + """ + helper = LayerHelper("box_coder", **locals()) + + if name is None: + output_box = helper.create_tmp_variable(dtype=prior_box.dtype) + else: + output_box = helper.create_variable( + name=name, dtype=prior_box.dtype, persistable=False) + + helper.append_op( + type="box_coder", + inputs={ + "PriorBox": prior_box, + "PriorBoxVar": prior_box_var, + "TargetBox": target_box + }, + attrs={"code_type": code_type, + "box_normalized": box_normalized}, + outputs={"OutputBox": output_box}) + return output_box + + +@templatedoc() +def polygon_box_transform(input, name=None): + """ + ${comment} + + Args: + input(${input_type}): ${input_comment} + + Returns: + output(${output_type}): ${output_comment} + """ + helper = LayerHelper("polygon_box_transform", **locals()) + if name is None: + output = helper.create_tmp_variable(dtype=input.dtype) + else: + output = helper.create_variable( + name=name, dtype=prior_box.input, persistable=False) + + helper.append_op( + type="polygon_box_transform", + inputs={"Input": input}, + attrs={}, + outputs={"Output": output}) + return output + + @templatedoc() def detection_map(detect_res, label, @@ -697,7 +784,7 @@ def ssd_loss(location, raise ValueError("Only support mining_type == max_negative now.") num, num_prior, num_class = confidence.shape - conf_shape = ops.shape(confidence) + conf_shape = nn.shape(confidence) def __reshape_to_2d(var): return nn.flatten(x=var, axis=2) @@ -724,7 +811,7 @@ def ssd_loss(location, target_label.stop_gradient = True conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) # 3. Mining hard examples - actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2]) + actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2]) actual_shape.stop_gradient = True conf_loss = nn.reshape( x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d56fa76300e7054ef71a7729483a579fa35f1dac..81c78cba219007a9348af961e4b0dc227edba747 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -507,7 +507,6 @@ def py_reader(capacity, 1. The basic usage of :code:`py_reader` is as follows: - >>> import paddle.v2 >>> import paddle.fluid as fluid >>> import paddle.dataset.mnist as mnist >>> @@ -515,7 +514,7 @@ def py_reader(capacity, >>> shapes=[(-1,3,224,224), (-1,1)], >>> dtypes=['float32', 'int64']) >>> reader.decorate_paddle_reader( - >>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) + >>> paddle.reader.shuffle(paddle.batch(mnist.train()) >>> >>> img, label = fluid.layers.read_file(reader) >>> loss = network(img, label) # some network definition @@ -534,7 +533,6 @@ def py_reader(capacity, 2. When training and testing are both performed, two different :code:`py_reader` should be created with different names, e.g.: - >>> import paddle.v2 >>> import paddle.fluid as fluid >>> import paddle.dataset.mnist as mnist >>> @@ -548,7 +546,7 @@ def py_reader(capacity, >>> dtypes=['float32', 'int64'], >>> name='train_reader') >>> train_reader.decorate_paddle_reader( - >>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) + >>> paddle.reader.shuffle(paddle.batch(mnist.train()) >>> >>> test_reader = fluid.layers.py_reader(capacity=32, >>> shapes=[(-1,3,224,224), (-1,1)], diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index b1598bfec210474ae1e17f9f88e8b57aa80b8452..a3064b565d096f7feda18379c66ffc8bf2f4a55c 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None): return acc_out -def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): +def auc(input, + label, + curve='ROC', + num_thresholds=2**12 - 1, + topk=1, + slide_steps=1): """ **Area Under the Curve (AUC) Layer** @@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): num_thresholds(int): The number of thresholds to use when discretizing the roc curve. Default 200. topk(int): only topk number of prediction output will be used for auc. + slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps. + Returns: Variable: A scalar representing the current AUC. @@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): auc_out = helper.create_tmp_variable(dtype="float64") batch_auc_out = helper.create_tmp_variable(dtype="float64") # make tp, tn, fp, fn persistable, so that can accumulate all batches. + + # for batch auc + batch_stat_pos = helper.create_global_variable( + persistable=True, + dtype='int64', + shape=[slide_steps, num_thresholds + 1]) + batch_stat_neg = helper.create_global_variable( + persistable=True, + dtype='int64', + shape=[slide_steps, num_thresholds + 1]) + + # for global auc stat_pos = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds + 1]) + persistable=True, dtype='int64', shape=[1, num_thresholds + 1]) stat_neg = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds + 1]) + persistable=True, dtype='int64', shape=[1, num_thresholds + 1]) - for var in [stat_pos, stat_neg]: + for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: helper.set_variable_initializer( var, Constant( value=0.0, force_cpu=True)) + # Batch AUC + helper.append_op( + type="auc", + inputs={ + "Predict": [input], + "Label": [label], + "StatPos": [batch_stat_pos], + "StatNeg": [batch_stat_neg] + }, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": slide_steps + }, + outputs={ + "AUC": [batch_auc_out], + "StatPosOut": [batch_stat_pos], + "StatNegOut": [batch_stat_neg] + }) + # Global AUC helper.append_op( type="auc", inputs={ @@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): "StatPos": [stat_pos], "StatNeg": [stat_neg] }, - attrs={"curve": curve, - "num_thresholds": num_thresholds}, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": 0 + }, outputs={ "AUC": [auc_out], - "BatchAUC": [batch_auc_out], "StatPosOut": [stat_pos], "StatNegOut": [stat_neg] }) - return auc_out, batch_auc_out, [stat_pos, stat_neg] + return auc_out, batch_auc_out, [ + batch_stat_pos, batch_stat_neg, stat_pos, stat_neg + ] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 6e0f3de4141aff3efb98b6d1162823a13f83a7bb..8c0ef7a82421ffc04bf669e6850e075226c09d27 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -133,6 +133,23 @@ __all__ = [ 'elementwise_max', 'elementwise_min', 'elementwise_pow', + 'uniform_random_batch_size_like', + 'gaussian_random', + 'sampling_id', + 'gaussian_random_batch_size_like', + 'sum', + 'slice', + 'shape', + 'logical_and', + 'logical_or', + 'logical_xor', + 'logical_not', + 'clip', + 'clip_by_norm', + 'mean', + 'mul', + 'sigmoid_cross_entropy_with_logits', + 'maxout', ] @@ -141,7 +158,6 @@ def fc(input, num_flatten_dims=1, param_attr=None, bias_attr=None, - use_mkldnn=False, act=None, is_test=False, name=None): @@ -193,8 +209,6 @@ def fc(input, If it is set to None, the bias is initialized zero. Default: None. act (str, default None): Activation to be applied to the output of this layer. is_test(bool): A flag indicating whether execution is in test phase. - use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn - library is installed. Default: False name (str, default None): The name of this layer. Returns: @@ -241,7 +255,7 @@ def fc(input, type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}, - attrs={"use_mkldnn": use_mkldnn}) + attrs={"use_mkldnn": False}) # add bias pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) # add activation @@ -1034,8 +1048,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): soft_label (bool): a flag indicating whether to interpretate the given labels as soft labels. Default: `False`. - ignore_index (int): Specifies a target value that is ignored and does - not contribute to the input gradient. Only valid + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid if soft_label is set to False. Default: -100 Returns: @@ -1405,7 +1419,6 @@ def conv2d(input, param_attr=None, bias_attr=None, use_cudnn=True, - use_mkldnn=False, act=None, name=None): """ @@ -1483,8 +1496,6 @@ def conv2d(input, bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled - with mkldnn library. Default: False act (str): Activation type. Default: None name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -1557,7 +1568,7 @@ def conv2d(input, 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - 'use_mkldnn': use_mkldnn + 'use_mkldnn': False }) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -1575,7 +1586,6 @@ def conv3d(input, param_attr=None, bias_attr=None, use_cudnn=True, - use_mkldnn=False, act=None, name=None): """ @@ -1649,7 +1659,6 @@ def conv3d(input, bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - use_mkldnn (bool): Use mkldnn kernels or not. act (str): Activation type. Default: None name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -1719,7 +1728,7 @@ def conv3d(input, 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - 'use_mkldnn': use_mkldnn + 'use_mkldnn': False }) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -1901,7 +1910,6 @@ def pool2d(input, global_pooling=False, use_cudnn=True, ceil_mode=False, - use_mkldnn=False, name=None): """ ${comment} @@ -1919,7 +1927,6 @@ def pool2d(input, global_pooling: ${global_pooling_comment} use_cudnn: ${use_cudnn_comment} ceil_mode: ${ceil_mode_comment} - use_mkldnn: ${use_mkldnn_comment} name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -1979,7 +1986,7 @@ def pool2d(input, "paddings": pool_padding, "use_cudnn": use_cudnn, "ceil_mode": ceil_mode, - "use_mkldnn": use_mkldnn + "use_mkldnn": False }) return pool_out @@ -1993,7 +2000,6 @@ def pool3d(input, global_pooling=False, use_cudnn=True, ceil_mode=False, - use_mkldnn=False, name=None): """ This function adds the operator for pooling in 3-dimensions, using the @@ -2008,7 +2014,6 @@ def pool3d(input, global_pooling (bool): ${global_pooling_comment} use_cudnn (bool): ${use_cudnn_comment} ceil_mode (bool): ${ceil_mode_comment} - use_mkldnn (bool): ${use_mkldnn_comment} name (str): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2049,7 +2054,7 @@ def pool3d(input, "paddings": pool_padding, "use_cudnn": use_cudnn, "ceil_mode": ceil_mode, - "use_mkldnn": use_mkldnn + "use_mkldnn": False }) return pool_out @@ -2064,7 +2069,6 @@ def batch_norm(input, bias_attr=None, data_layout='NCHW', in_place=False, - use_mkldnn=False, name=None, moving_mean_name=None, moving_variance_name=None, @@ -2106,7 +2110,6 @@ def batch_norm(input, bias_attr(ParamAttr): The parameter attribute for Parameter `bias`. data_layout(string, default NCHW): NCHW|NHWC in_place(bool, Default False): Make the input and output of batch norm reuse memory. - use_mkldnn(bool, Default false): ${use_mkldnn_comment} name(string, Default None): A name for this layer(optional). If set None, the layer will be named automatically. moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. @@ -2198,7 +2201,7 @@ def batch_norm(input, "momentum": momentum, "epsilon": epsilon, "is_test": is_test, - "use_mkldnn": use_mkldnn, + "use_mkldnn": False, "fuse_with_relu": fuse_with_relu }) @@ -2795,20 +2798,20 @@ def sequence_pad(x, pad_value, maxlen=None): Args: x(Variable): Input variable which should contain lod information. - pad_value(Variable): The Variable that holds values that will be fill - into padded steps. It can be a scalar or a tensor whose shape - equals to time steps in sequences. If it's a scalar, it will be + pad_value(Variable): The Variable that holds values that will be fill + into padded steps. It can be a scalar or a tensor whose shape + equals to time steps in sequences. If it's a scalar, it will be automatically broadcasted to the shape of time step. - maxlen(int, default None): The length of padded sequences. It can be - None or any positive int. When it is None, all sequences will be - padded up to the length of the longest one among them; when it a - certain positive value, it must be greater than the length of the + maxlen(int, default None): The length of padded sequences. It can be + None or any positive int. When it is None, all sequences will be + padded up to the length of the longest one among them; when it a + certain positive value, it must be greater than the length of the longest original sequence." - + Returns: - Variable: The padded sequence batch and the original lengths before + Variable: The padded sequence batch and the original lengths before padding. All sequences has the same length. - + Examples: .. code-block:: python @@ -4424,8 +4427,8 @@ def softmax_with_cross_entropy(logits, soft_label is set to true, Label is a Tensor with soft_label (bool): A flag to indicate whether to interpretate the given labels as soft labels. By default, `soft_label` is set to False. - ignore_index (int): Specifies a target value that is ignored and does - not contribute to the input gradient. Only valid + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid if soft_label is set to False. Default: -100 Returns: @@ -4682,14 +4685,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): def squeeze(input, axes, name=None): """ - Remove single-dimensional entries from the shape of a tensor. Takes a - parameter axes with a list of axes to squeeze. If axes is not provided, all - the single dimensions will be removed from the shape. If an axis is + Remove single-dimensional entries from the shape of a tensor. Takes a + parameter axes with a list of axes to squeeze. If axes is not provided, all + the single dimensions will be removed from the shape. If an axis is selected with shape entry not equal to one, an error is raised. - + Examples: Case 1: - Given + Given X.shape = (1, 3, 1, 5) and axes = [0] @@ -4698,11 +4701,11 @@ def squeeze(input, axes, name=None): Case 2: Given X.shape = (1, 3, 1, 5) - and + and axes = [] we get: Out.shape = (3, 5) - + Args: input (Variable): The input variable to be squeezed. axes (list): List of integers, indicating the dimensions to be squeezed. @@ -4732,14 +4735,14 @@ def squeeze(input, axes, name=None): def unsqueeze(input, axes, name=None): """ - Insert single-dimensional entries to the shape of a tensor. Takes one - required argument axes, a list of dimensions that will be inserted. - Dimension indices in axes are as seen in the output tensor. + Insert single-dimensional entries to the shape of a tensor. Takes one + required argument axes, a list of dimensions that will be inserted. + Dimension indices in axes are as seen in the output tensor. - For example: - Given a tensor such that tensor with shape [3, 4, 5], + For example: + Given a tensor such that tensor with shape [3, 4, 5], then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. - + Args: input (Variable): The input variable to be unsqueezed. axes (list): List of integers, indicating the dimensions to be inserted. @@ -5838,39 +5841,39 @@ def pad2d(input, Example: Given that X is a channel of image from input: - + X = [[1, 2, 3], [4, 5, 6]] - + Case 0: - + paddings = [0, 1, 2, 3], mode = 'constant' pad_value = 0 - + Out = [[0, 0, 1, 2, 3, 0, 0, 0] [0, 0, 4, 5, 6, 0, 0, 0] [0, 0, 0, 0, 0, 0, 0, 0]] - + Case 1: - + paddings = [0, 1, 2, 1], mode = 'reflect' - + Out = [[3, 2, 1, 2, 3, 2] [6, 5, 4, 5, 6, 5] [3, 2, 1, 2, 3, 2]] - + Case 2: - + paddings = [0, 1, 2, 1], mode = 'edge' - + Out = [[1, 1, 1, 2, 3, 3] [4, 4, 4, 5, 6, 6] [4, 4, 4, 5, 6, 6]] - - + + Args: input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format. paddings (tuple|list): The padding size. If padding is a tuple, it must @@ -6069,7 +6072,7 @@ def prelu(x, mode, param_attr=None, name=None): channel:elements in a channel share same weight element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Returns: Variable: The output tensor with the same shape as input. @@ -6247,10 +6250,10 @@ def flatten(x, axis=1, name=None): def sequence_enumerate(input, win_size, pad_value=0, name=None): """ Generate a new sequence for the input index sequence, which enumerates all the - sub-sequences with length `win_size` of the input. + sub-sequences with length `win_size` of the input. The enumerated sequence has the same 1st dimension with variable `input`, and the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. - + Examples: Case 1: Input: @@ -6377,20 +6380,20 @@ def unstack(x, axis=0, num=None): **UnStack Layer** This layer unstacks input :code:`x` into several tensors along axis. - + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is - raised. + raised. Args: - x (Variable): Input variable. + x (Variable): Input variable. axis (int): The axis along which the input is unstacked. num (int|None): The number of output variables. - + Returns: list(Variable): The unstacked variables. - + """ helper = LayerHelper('unstack', **locals()) @@ -6423,21 +6426,21 @@ def expand(x, expand_times, name=None): .. code-block:: text Input(X) is a 3-D tensor with shape [2, 3, 1]: - + [ [[1], [2], [3]], [[4], [5], [6]] ] - + Attr(expand_times): [1, 2, 2] - + Output(Out) is a 3-D tensor with shape [2, 6, 2]: - + [ [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] ] - + Args: x (Variable): A tensor with rank in [1, 6]. expand_times (list|tuple): Expand times number for each dimension. @@ -6463,6 +6466,239 @@ def expand(x, expand_times, name=None): return out +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +@templatedoc() +def uniform_random_batch_size_like(input, + shape, + dtype='float32', + input_dim_idx=0, + output_dim_idx=0, + min=-1.0, + max=1.0, + seed=0): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + shape (tuple|list): ${shape_comment} + input_dim_idx (Int): ${input_dim_idx_comment} + output_dim_idx (Int): ${output_dim_idx_comment} + min (Float): ${min_comment} + max (Float): ${max_comment} + seed (Int): ${seed_comment} + dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('uniform_random_batch_size_like', **locals()) + out = helper.create_tmp_variable(dtype) + c_dtype = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='uniform_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'min': min, + 'max': max, + 'seed': seed, + 'dtype': c_dtype + }) + + return out + + +@templatedoc() +def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'): + """ + ${comment} + + Args: + shape (tuple|list): ${shape_comment} + mean (Float): ${mean_comment} + std (Float): ${std_comment} + seed (Int): ${seed_comment} + dtype(np.dtype|core.VarDesc.VarType|str): Output data type. + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('gaussian_random', **locals()) + out = helper.create_tmp_variable(dtype) + c_dtype = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='gaussian_random', + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'mean': mean, + 'std': std, + 'seed': seed, + 'dtype': c_dtype, + 'use_mkldnn': False + }) + + return out + + +@templatedoc() +def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + min (Float): ${min_comment} + max (Float): ${max_comment} + seed (Float): ${seed_comment} + dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('sampling_id', **locals()) + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='sampling_id', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'min': min, + 'max': max, + 'seed': seed}) + + return out + + +@templatedoc() +def gaussian_random_batch_size_like(input, + shape, + input_dim_idx=0, + output_dim_idx=0, + mean=0.0, + std=1.0, + seed=0, + dtype='float32'): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + shape (tuple|list): ${shape_comment} + input_dim_idx (Int): ${input_dim_idx_comment} + output_dim_idx (Int): ${output_dim_idx_comment} + mean (Float): ${mean_comment} + std (Float): ${std_comment} + seed (Int): ${seed_comment} + dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc + + Returns: + out (Variable): ${out_comment} + """ + + helper = LayerHelper('gaussian_random_batch_size_like', **locals()) + out = helper.create_tmp_variable(dtype) + c_dtype = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='gaussian_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'mean': mean, + 'std': std, + 'seed': seed, + 'dtype': c_dtype + }) + + return out + + +@templatedoc() +def sum(x): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + + Returns: + out (Variable): ${out_comment} + """ + + helper = LayerHelper('sum', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('x')) + helper.append_op( + type='sum', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'use_mkldnn': False}) + + return out + + +@templatedoc() +def slice(input, axes, starts, ends): + """ + ${comment} + + Args: + input (Variable): ${input_comment}. + axes (List): ${axes_comment} + starts (List): ${starts_comment} + ends (List): ${ends_comment} + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('slice', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + helper.append_op( + type='slice', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={'axes': axes, + 'starts': starts, + 'ends': ends}) + + return out + + +@templatedoc() +def shape(input): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('shape', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + helper.append_op( + type='shape', inputs={'Input': input}, outputs={'Out': out}) + + return out + + def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) @@ -6499,7 +6735,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): bias(${bias_type}): ${bias_comment} bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment} act(basestring|None): Activation applied to the output. - name(basestring|None): Name of the output. + name(basestring|None): Name of the output. Returns: out(${out_type}): ${out_comment} @@ -6524,31 +6760,31 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): return helper.append_activation(out) -def elementwise_add(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_add(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_add', **locals())) -def elementwise_div(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_div(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_div', **locals())) -def elementwise_sub(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_sub(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_sub', **locals())) -def elementwise_mul(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_mul(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_mul', **locals())) -def elementwise_max(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_max(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_max', **locals())) -def elementwise_min(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_min(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_min', **locals())) -def elementwise_pow(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_pow(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_pow', **locals())) @@ -6563,3 +6799,288 @@ for func in [ "act (basestring|None): Activation applied to the output.", "name (basestring|None): Name of the output." ]) + + +def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): + helper = LayerHelper(op_name, **locals()) + + if binary_op: + assert x.dtype == y.dtype + + if out is None: + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + if binary_op: + helper.append_op( + type=op_name, inputs={"X": x, + "Y": y}, outputs={"Out": out}) + else: + helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out}) + + return out + + +@templatedoc() +def logical_and(x, y, out=None, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + out(Tensor): Output tensor of logical operation. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + return _logical_op( + op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True) + + +@templatedoc() +def logical_or(x, y, out=None, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + out(Tensor): Output tensor of logical operation. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + return _logical_op( + op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True) + + +@templatedoc() +def logical_xor(x, y, out=None, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + out(Tensor): Output tensor of logical operation. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + return _logical_op( + op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True) + + +@templatedoc() +def logical_not(x, out=None, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + out(Tensor): Output tensor of logical operation. + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + return _logical_op( + op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False) + + +@templatedoc() +def clip(x, min, max, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + min(${min_type}): ${min_comment} + max(${max_type}): ${max_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("clip", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="clip", + inputs={"X": x}, + attrs={"min": min, + "max": max}, + outputs={"Out": out}) + + return out + + +@templatedoc() +def clip_by_norm(x, max_norm, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + max_norm(${max_norm_type}): ${max_norm_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("clip_by_norm", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="clip_by_norm", + inputs={"X": x}, + attrs={"max_norm": max_norm}, + outputs={"Out": out}) + + return out + + +@templatedoc() +def mean(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("mean", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="mean", inputs={"X": x}, attrs={}, outputs={"Out": out}) + + return out + + +@templatedoc() +def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + y(${y_type}): ${y_comment} + x_num_col_dims(${x_num_col_dims_type}): ${x_num_col_dims_comment} + y_num_col_dims(${y_num_col_dims_type}): ${y_num_col_dims_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("mul", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="mul", + inputs={"X": x, + "Y": y}, + attrs={ + "x_num_col_dims": x_num_col_dims, + "y_num_col_dims": y_num_col_dims + }, + outputs={"Out": out}) + return out + + +@templatedoc() +def sigmoid_cross_entropy_with_logits(x, label, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + label(${label_type}): ${label_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="sigmoid_cross_entropy_with_logits", + inputs={"X": x, + "Label": label}, + attrs={}, + outputs={"Out": out}) + return out + + +@templatedoc() +def maxout(x, groups, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + groups(${groups_type}): ${groups_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + helper = LayerHelper("maxout", **locals()) + + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type="maxout", + inputs={"X": x}, + attrs={"groups": groups}, + outputs={"Out": out}) + return out diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 7867bfe00e25711643eab1ab8d0141dbbad3da52..9a8300524d8784fae598635796888382b1adbccf 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -35,25 +35,7 @@ __activations_noattr__ = [ 'softsign', ] -__all__ = [ - 'mean', - 'mul', - 'sigmoid_cross_entropy_with_logits', - 'clip', - 'clip_by_norm', - 'logical_and', - 'logical_or', - 'logical_xor', - 'logical_not', - 'uniform_random_batch_size_like', - 'gaussian_random', - 'sampling_id', - 'gaussian_random_batch_size_like', - 'sum', - 'slice', - 'shape', - 'maxout', -] +__all__ = [] for _OP in set(__all__): globals()[_OP] = generate_layer_fn(_OP) @@ -63,6 +45,8 @@ for _OP in set(__all__): # e.g.: test_program_code.py, test_dist_train.py globals()['_scale'] = generate_layer_fn('scale') +globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') + __all__ += __activations_noattr__ for _OP in set(__activations_noattr__): diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 06513801dd8b34d366f9632f6943c8046872c31b..1dabad54f5b976e0fcabf6918d3bc6ece4eed384 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -40,8 +40,7 @@ def simple_img_conv_pool(input, param_attr=None, bias_attr=None, act=None, - use_cudnn=True, - use_mkldnn=False): + use_cudnn=True): """ The simple_img_conv_pool is composed with one Convolution2d and one Pool2d. @@ -84,8 +83,6 @@ def simple_img_conv_pool(input, act (str): Activation type for Conv2d. Default: None use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled - with mkldnn library. Default: False Return: Variable: The result of input after Convolution2d and Pool2d. @@ -112,8 +109,7 @@ def simple_img_conv_pool(input, param_attr=param_attr, bias_attr=bias_attr, act=act, - use_cudnn=use_cudnn, - use_mkldnn=use_mkldnn) + use_cudnn=use_cudnn) pool_out = layers.pool2d( input=conv_out, @@ -122,8 +118,7 @@ def simple_img_conv_pool(input, pool_stride=pool_stride, pool_padding=pool_padding, global_pooling=global_pooling, - use_cudnn=use_cudnn, - use_mkldnn=use_mkldnn) + use_cudnn=use_cudnn) return pool_out @@ -138,8 +133,7 @@ def img_conv_group(input, conv_batchnorm_drop_rate=0.0, pool_stride=1, pool_type="max", - use_cudnn=True, - use_mkldnn=False): + use_cudnn=True): """ The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut, and Pool2d. According to the input arguments, img_conv_group will do serials of @@ -177,8 +171,6 @@ def img_conv_group(input, average-pooling. Default :math:`max`. use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled - with mkldnn library. Default: False Return: Variable: The final result after serial computation using Convolution2d, @@ -226,8 +218,7 @@ def img_conv_group(input, padding=conv_padding[i], param_attr=param_attr[i], act=local_conv_act, - use_cudnn=use_cudnn, - use_mkldnn=use_mkldnn) + use_cudnn=use_cudnn) if conv_with_batchnorm[i]: tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True) @@ -240,8 +231,7 @@ def img_conv_group(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, - use_cudnn=use_cudnn, - use_mkldnn=use_mkldnn) + use_cudnn=use_cudnn) return pool_out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ad09005d866b10146e6fcd7cf108c51f34322607..1b9571f6d3a6a69d1ac35f6be74b80eaa2ce6251 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -26,6 +26,7 @@ from .layer_helper import LayerHelper from .regularizer import append_regularization_ops from .clip import append_gradient_clip_ops, error_clip_callback from contextlib import contextmanager +from .layers import ops __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', @@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer): x=tmp, dtype='float32' if self._dtype == None else self._dtype) sum = layers.cast( x=sum, dtype='float32' if self._dtype == None else self._dtype) - layers.elementwise_div(x=sum, y=tmp, out=param) + ops._elementwise_div(x=sum, y=tmp, out=param) def _add_average_restore_op(self, block, param_grad): param = block._clone_variable(param_grad[0]) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 44af29d3390e35129d0ee65b31eacad6b28a9d60..57d272cbfb948840679e80e8db40379c57603113 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -74,28 +74,7 @@ class ParallelExecutor(object): build_strategy=None, num_trainers=1, trainer_id=0, - scope=None, - **kwargs): - if len(kwargs) != 0: - err_msg = "" - for key in kwargs: - if key in dir(ExecutionStrategy): - err_msg += \ - "Setting {0} by constructor is deprecated. Use " \ - "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \ - "pe=ParallelExecutor(exec_strategy=strategy) " \ - "instead.\n ".format(key) - elif key in dir(BuildStrategy): - err_msg += \ - "Setting {0} by constructor is deprecated. Use " \ - "strategy=BuildStrategy(); See help(" \ - "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format( - key) - else: - err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format( - key) - raise ValueError(err_msg) - + scope=None): self._places = [] self._act_places = [] if use_cuda: diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index f0be794327f51cbbc4202b8b7b401b712b6d66a3..a51607bfdb1dde3d25f490770cc2ba368ceb27ff 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -185,7 +185,17 @@ class WeightNormParamAttr(ParamAttr): Args: dim(list): The parameter's name. Default None. - kwargs: Any field in ParamAttr. Default None. + name(str): The parameter's name. Default None. + initializer(Initializer): The method to initial this parameter. Default None. + learning_rate(float): The parameter's learning rate. The learning rate when + optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`. + Default 1.0. + regularizer(WeightDecayRegularizer): Regularization factor. Default None. + trainable(bool): Whether this parameter is trainable. Default True. + gradient_clip(BaseGradientClipAttr): The method to clip this parameter's + gradient. Default None. + do_model_average(bool): Whether this parameter should do model average. + Default False. Examples: .. code-block:: python @@ -204,6 +214,21 @@ class WeightNormParamAttr(ParamAttr): # these paramters for inference. params_with_weight_norm = [] - def __init__(self, dim=None, **kwargs): - super(WeightNormParamAttr, self).__init__(**kwargs) + def __init__(self, + dim=None, + name=None, + initializer=None, + learning_rate=1.0, + regularizer=None, + trainable=True, + gradient_clip=None, + do_model_average=False): + super(WeightNormParamAttr, self).__init__( + name=name, + initializer=initializer, + learning_rate=learning_rate, + regularizer=regularizer, + trainable=trainable, + gradient_clip=gradient_clip, + do_model_average=do_model_average) self.dim = dim diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index d24417bbacb503d9ea70e68e7e0edb59e7dddbde..1885dda44ab5eaeca6a4f54e4b84379c71ec3167 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -1,3 +1,4 @@ +set(PYTHON_TESTS_DIR ${CMAKE_CURRENT_BINARY_DIR} CACHE PATH "python tests directory") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt index 673c965b662a022739f8d489c331f4de9455a926..ad056aaa7b30b06d950486fd059c5b6a15770551 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt @@ -2,6 +2,16 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") # default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() +if(NOT APPLE) + foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) + endforeach() +else() + foreach(src ${TEST_OPS}) + if(${src} STREQUAL "test_recognize_digits_conv") + message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) + else() + py_test(${src} SRCS ${src}.py) + endif() + endforeach() +endif() diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 4b4f3e403776625fb5ca2f9b03d14ee7efe23d53..4a70976a4837c668a5e0ba6d49b598d046a8ec5d 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -67,6 +67,7 @@ def train(nn_type, use_cuda, parallel, save_dirname=None, + save_full_dirname=None, model_filename=None, params_filename=None, is_local=True): @@ -143,6 +144,13 @@ def train(nn_type, exe, model_filename=model_filename, params_filename=params_filename) + if save_full_dirname is not None: + fluid.io.save_inference_model( + save_full_dirname, [], [], + exe, + model_filename=model_filename, + params_filename=params_filename, + export_for_deployment=False) return else: print( @@ -214,10 +222,12 @@ def infer(use_cuda, def main(use_cuda, parallel, nn_type, combine): save_dirname = None + save_full_dirname = None model_filename = None params_filename = None if not use_cuda and not parallel: save_dirname = "recognize_digits_" + nn_type + ".inference.model" + save_full_dirname = "recognize_digits_" + nn_type + ".train.model" if combine == True: model_filename = "__model_combined__" params_filename = "__params_combined__" @@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine): use_cuda=use_cuda, parallel=parallel, save_dirname=save_dirname, + save_full_dirname=save_full_dirname, model_filename=model_filename, params_filename=params_filename) infer( diff --git a/python/paddle/fluid/tests/no_test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py deleted file mode 100644 index b5d7676f4a2cb085c6900cd0bd0644afa2b2afd5..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/no_test_concurrency.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid import framework, unique_name, layer_helper -from paddle.fluid.executor import Executor -from paddle.fluid.layers import fill_constant, assign, While, elementwise_add, Print - - -class TestRoutineOp(unittest.TestCase): - def test_simple_routine(self): - ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - - # Create LOD_TENSOR and put it into the scope. This placeholder - # variable will be filled in and returned by fluid.channel_recv - result = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT64) - - with fluid.Go(): - input_value = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234) - fluid.channel_send(ch, input_value) - - result, status = fluid.channel_recv(ch, result) - fluid.channel_close(ch) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - outs = exe.run(fetch_list=[result]) - self.assertEqual(outs[0], 1234) - - def test_daisy_chain(self): - ''' - Mimics classic Daisy-chain test: https://talks.golang.org/2012/concurrency.slide#39 - ''' - n = 100 - - leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - left = leftmost - - # TODO(thuan): Use fluid.While() after scope capture is implemented. - # https://github.com/PaddlePaddle/Paddle/issues/8502 - for i in range(n): - right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - with fluid.Go(): - one_tensor = self._create_one_dim_tensor(1) - result = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT64) - - result, status = fluid.channel_recv(right, result) - one_added = fluid.layers.elementwise_add(x=one_tensor, y=result) - fluid.channel_send(left, one_added) - left = right - - # Trigger the channel propagation by sending a "1" to rightmost channel - with fluid.Go(): - one_tensor = self._create_one_dim_tensor(1) - fluid.channel_send(right, one_tensor) - - leftmost_result = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT64) - leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result) - - cpu = core.CPUPlace() - exe = Executor(cpu) - leftmost_data = exe.run(fetch_list=[leftmost_result]) - - # The leftmost_data should be equal to the number of channels + 1 - self.assertEqual(leftmost_data[0][0], n + 1) - - def _create_one_dim_tensor(self, value): - one_dim_tensor = fill_constant(shape=[1], dtype='int', value=value) - one_dim_tensor.stop_gradient = True - return one_dim_tensor - - def _create_tensor(self, name, type, dtype): - return framework.default_main_program().current_block().create_var( - name=unique_name.generate(name), type=type, dtype=dtype) - - def _create_persistable_tensor(self, name, type, dtype): - return framework.default_main_program().current_block().create_var( - name=unique_name.generate(name), - type=type, - dtype=dtype, - persistable=True) - - def test_select(self): - with framework.program_guard(framework.Program()): - ch1 = fluid.make_channel( - dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) - - result1 = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.FP64) - - input_value = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.FP64, value=10) - - with fluid.Select() as select: - with select.case(fluid.channel_send, ch1, input_value): - # Execute something. - pass - - with select.default(): - pass - - # This should not block because we are using a buffered channel. - result1, status = fluid.channel_recv(ch1, result1) - fluid.channel_close(ch1) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - result = exe.run(fetch_list=[result1]) - self.assertEqual(result[0][0], 10) - - def test_fibonacci(self): - """ - Mimics Fibonacci Go example: https://tour.golang.org/concurrency/5 - """ - with framework.program_guard(framework.Program()): - quit_ch_input_var = self._create_persistable_tensor( - 'quit_ch_input', core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT32) - quit_ch_input = fill_constant( - shape=[1], - dtype=core.VarDesc.VarType.INT32, - value=0, - out=quit_ch_input_var) - - result = self._create_persistable_tensor( - 'result', core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.INT32) - fill_constant( - shape=[1], - dtype=core.VarDesc.VarType.INT32, - value=0, - out=result) - - x = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - y = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=1) - - while_cond = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True) - - while_false = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False) - - x_tmp = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - - def fibonacci(channel, quit_channel): - while_op = While(cond=while_cond) - with while_op.block(): - result2 = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - - with fluid.Select() as select: - with select.case( - fluid.channel_send, channel, x, is_copy=True): - assign(input=x, output=x_tmp) - assign(input=y, output=x) - assign(elementwise_add(x=x_tmp, y=y), output=y) - - with select.case(fluid.channel_recv, quit_channel, - result2): - # Quit - helper = layer_helper.LayerHelper('assign') - helper.append_op( - type='assign', - inputs={'X': [while_false]}, - outputs={'Out': [while_cond]}) - - ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - - with fluid.Go(): - for i in range(10): - fluid.channel_recv(ch1, result) - Print(result) - - fluid.channel_send(quit_ch, quit_ch_input) - - fibonacci(ch1, quit_ch) - - fluid.channel_close(ch1) - fluid.channel_close(quit_ch) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - exe_result = exe.run(fetch_list=[result]) - self.assertEqual(exe_result[0][0], 34) - - def test_ping_pong(self): - """ - Mimics Ping Pong example: https://gobyexample.com/channel-directions - """ - with framework.program_guard(framework.Program()): - result = self._create_tensor('return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.FP64) - - ping_result = self._create_tensor('ping_return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.FP64) - - def ping(ch, message): - fluid.channel_send(ch, message, is_copy=True) - - def pong(ch1, ch2): - fluid.channel_recv(ch1, ping_result) - fluid.channel_send(ch2, ping_result, is_copy=True) - - pings = fluid.make_channel( - dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) - pongs = fluid.make_channel( - dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) - - msg = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.FP64, value=9) - - ping(pings, msg) - pong(pings, pongs) - - fluid.channel_recv(pongs, result) - - fluid.channel_close(pings) - fluid.channel_close(pongs) - - cpu = core.CPUPlace() - exe = Executor(cpu) - - exe_result = exe.run(fetch_list=[result]) - self.assertEqual(exe_result[0][0], 9) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d02c890209e65bdceb5da23ba5b9c7c0356174b8..7de0ebce06e9de439d3570bee9ac7dbce33ee868 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test - if(APPLE) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_desc_clone) @@ -77,11 +76,13 @@ if(WITH_DISTRIBUTE) if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) - py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) + set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) + # TODO: fix this test + #py_test_modules(test_dist_transformer MODULES test_dist_transformer) + #set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) - #FIXME(gongwb): random fails. - #py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py new file mode 100644 index 0000000000000000000000000000000000000000..902dc6544ed6858c4cd8d64b14d6af2367059091 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -0,0 +1,109 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid + +import dist_ctr_reader +from test_dist_base import TestDistRunnerBase, runtime_main + +IS_SPARSE = True + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +class TestDistCTR2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta() + """ network definition """ + dnn_data = fluid.layers.data( + name="dnn_data", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + lr_data = fluid.layers.data( + name="lr_data", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + label = fluid.layers.data( + name="click", + shape=[-1, 1], + dtype="int64", + lod_level=0, + append_batch_size=False) + + # build dnn model + dnn_layer_dims = [128, 64, 32, 1] + dnn_embedding = fluid.layers.embedding( + is_distributed=False, + input=dnn_data, + size=[dnn_input_dim, dnn_layer_dims[0]], + param_attr=fluid.ParamAttr( + name="deep_embedding", + initializer=fluid.initializer.Constant(value=0.01)), + is_sparse=IS_SPARSE) + dnn_pool = fluid.layers.sequence_pool( + input=dnn_embedding, pool_type="sum") + dnn_out = dnn_pool + for i, dim in enumerate(dnn_layer_dims[1:]): + fc = fluid.layers.fc( + input=dnn_out, + size=dim, + act="relu", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01)), + name='dnn-fc-%d' % i) + dnn_out = fc + + # build lr model + lr_embbding = fluid.layers.embedding( + is_distributed=False, + input=lr_data, + size=[lr_input_dim, 1], + param_attr=fluid.ParamAttr( + name="wide_embedding", + initializer=fluid.initializer.Constant(value=0.01)), + is_sparse=IS_SPARSE) + lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum") + + merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1) + + predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax') + acc = fluid.layers.accuracy(input=predict, label=label) + auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, + label=label) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + inference_program = paddle.fluid.default_main_program().clone() + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + sgd_optimizer.minimize(avg_cost) + + dataset = dist_ctr_reader.Dataset() + train_reader = paddle.batch(dataset.train(), batch_size=batch_size) + test_reader = paddle.batch(dataset.test(), batch_size=batch_size) + + return inference_program, avg_cost, train_reader, test_reader, None, predict + + +if __name__ == "__main__": + runtime_main(TestDistCTR2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..95e39d891f7e6a3dcb57540bd96fe70027443cda --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py @@ -0,0 +1,172 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import paddle +import tarfile + +logging.basicConfig() +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + +DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz" +DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e" +""" +avazu_ctr_data/train.txt +avazu_ctr_data/infer.txt +avazu_ctr_data/test.txt +avazu_ctr_data/data.meta.txt +""" + + +def read_data(file_name): + path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5) + tar = tarfile.open(path, "r:gz") + tar_info = None + for member in tar.getmembers(): + if member.name.endswith(file_name): + tar_info = member + f = tar.extractfile(tar_info) + ret_lines = [_.decode('utf-8') for _ in f.readlines()] + return ret_lines + + +class TaskMode: + TRAIN_MODE = 0 + TEST_MODE = 1 + INFER_MODE = 2 + + def __init__(self, mode): + self.mode = mode + + def is_train(self): + return self.mode == self.TRAIN_MODE + + def is_test(self): + return self.mode == self.TEST_MODE + + def is_infer(self): + return self.mode == self.INFER_MODE + + @staticmethod + def create_train(): + return TaskMode(TaskMode.TRAIN_MODE) + + @staticmethod + def create_test(): + return TaskMode(TaskMode.TEST_MODE) + + @staticmethod + def create_infer(): + return TaskMode(TaskMode.INFER_MODE) + + +class ModelType: + CLASSIFICATION = 0 + REGRESSION = 1 + + def __init__(self, mode): + self.mode = mode + + def is_classification(self): + return self.mode == self.CLASSIFICATION + + def is_regression(self): + return self.mode == self.REGRESSION + + @staticmethod + def create_classification(): + return ModelType(ModelType.CLASSIFICATION) + + @staticmethod + def create_regression(): + return ModelType(ModelType.REGRESSION) + + +def load_dnn_input_record(sent): + return list(map(int, sent.split())) + + +def load_lr_input_record(sent): + res = [] + for _ in [x.split(':') for x in sent.split()]: + res.append(int(_[0])) + return res + + +feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2} + + +class Dataset(object): + def train(self): + ''' + Load trainset. + ''' + file_name = "train.txt" + logger.info("load trainset from %s" % file_name) + mode = TaskMode.create_train() + return self._parse_creator(file_name, mode) + + def test(self): + ''' + Load testset. + ''' + file_name = "test.txt" + logger.info("load testset from %s" % file_name) + mode = TaskMode.create_test() + return self._parse_creator(file_name, mode) + + def infer(self): + ''' + Load infer set. + ''' + file_name = "infer.txt" + logger.info("load inferset from %s" % file_name) + mode = TaskMode.create_infer() + return self._parse_creator(file_name, mode) + + def _parse_creator(self, file_name, mode): + ''' + Parse dataset. + ''' + + def _parse(): + data = read_data(file_name) + for line_id, line in enumerate(data): + fs = line.strip().split('\t') + dnn_input = load_dnn_input_record(fs[0]) + lr_input = load_lr_input_record(fs[1]) + if not mode.is_infer(): + click = int(fs[2]) + yield [dnn_input, lr_input, click] + else: + yield [dnn_input, lr_input] + + return _parse + + +def load_data_meta(): + ''' + load data meta info from path, return (dnn_input_dim, lr_input_dim) + ''' + lines = read_data('data.meta.txt') + err_info = "wrong meta format" + assert len(lines) == 2, err_info + assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[ + 1], err_info + res = map(int, [_.split(':')[1] for _ in lines]) + res = list(res) + logger.info('dnn input dim: %d' % res[0]) + logger.info('lr input dim: %d' % res[1]) + return res diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 85a96c0b53f6bc08687965048d6251265055a6fe..877d21ae882ab4efb49beb6a846ab71a22c2aab7 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -47,7 +47,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=0.3))) + value=0.01))) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, @@ -56,7 +56,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=0.2))) + value=0.01))) SIZE = 10 input_shape = conv_pool_2.shape @@ -68,7 +68,7 @@ def cnn_model(data): size=SIZE, act="softmax", param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) + initializer=fluid.initializer.Constant(value=0.01))) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index a4ffe7d40c40501ebd43fec0b664159227ea34bd..5da370570680e9f10a22ad882e3346e6381dfe63 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -247,7 +247,7 @@ class DistSeResneXt2x2(TestDistRunnerBase): # Reader train_reader = paddle.batch( - paddle.dataset.flowers.train(), batch_size=batch_size) + paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) test_reader = paddle.batch( paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py new file mode 100644 index 0000000000000000000000000000000000000000..6456d1b53a129db04ace7ff4413a3d76e922ccde --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -0,0 +1,238 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math +import random + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "int64" +DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000' +DATA_MD5 = '24e49366eb0611c552667989de2f57d5' + +# For Net +base_lr = 0.2 +emb_lr = base_lr * 3 +dict_dim = 1500 +emb_dim = 128 +hid_dim = 128 +margin = 0.1 +sample_rate = 1 + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def get_acc(cos_q_nt, cos_q_pt, batch_size): + cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) + cond = fluid.layers.cast(cond, dtype='float64') + cond_3 = fluid.layers.reduce_sum(cond) + acc = fluid.layers.elementwise_div( + cond_3, + fluid.layers.fill_constant( + shape=[1], value=batch_size * 1.0, dtype='float64'), + name="simnet_acc") + return acc + + +def get_loss(cos_q_pt, cos_q_nt): + loss_op1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like( + input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'), + cos_q_pt) + loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) + loss_op3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like( + input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'), + loss_op2) + avg_cost = fluid.layers.mean(loss_op3) + return avg_cost + + +def get_optimizer(): + # SGD optimizer + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + return optimizer + + +def train_network(batch_size, is_distributed=False, is_sparse=False): + # query + q = fluid.layers.data( + name="query_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + q_emb = fluid.layers.embedding( + input=q, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') + q_ss = fluid.layers.softsign(q_sum) + ## fc layer after conv + q_fc = fluid.layers.fc( + input=q_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__q_fc__", + learning_rate=base_lr)) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data( + name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + pt_emb = fluid.layers.embedding( + input=pt, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') + pt_ss = fluid.layers.softsign(pt_sum) + ## fc layer + pt_fc = fluid.layers.fc( + input=pt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + # nt + nt = fluid.layers.data( + name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + nt_emb = fluid.layers.embedding( + input=nt, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') + nt_ss = fluid.layers.softsign(nt_sum) + ## fc layer + nt_fc = fluid.layers.fc( + input=nt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc) + cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc) + # loss + avg_cost = get_loss(cos_q_pt, cos_q_nt) + # acc + acc = get_acc(cos_q_nt, cos_q_pt, batch_size) + return [avg_cost, acc, cos_q_pt] + + +def combination(x, y): + res = [[[xi, yi] for yi in y] for xi in x] + return res[0] + + +def get_one_data(file_list): + for file in file_list: + contents = [] + with open(file, "r") as fin: + for i in fin: + contents.append(i.strip()) + for index, q in enumerate(contents): + try: + one_data = [[int(j) for j in i.split(" ")] + for i in q.split(";")[:-1]] + if one_data[1][0] + one_data[1][1] != len(one_data) - 3: + q = fin.readline() + continue + tmp = combination(one_data[3:3 + one_data[1][0]], + one_data[3 + one_data[1][0]:]) + except Exception as e: + continue + + for each in tmp: + yield [one_data[2], 0, each[0], each[1]] + + +def get_batch_reader(file_list, batch_size): + def batch_reader(): + res = [] + for i in get_one_data(file_list): + if random.random() <= sample_rate: + res.append(i) + if len(res) >= batch_size: + yield res + res = [] + + return batch_reader + + +def get_train_reader(batch_size): + # The training data set. + train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet", + "train") + train_reader = get_batch_reader([train_file], batch_size) + train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"] + return train_reader, train_feed + + +class TestDistSimnetBow2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + # Train program + avg_cost, acc, predict = \ + train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) + + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = get_optimizer() + opt.minimize(avg_cost) + + # Reader + train_reader, _ = get_train_reader(batch_size) + return inference_program, avg_cost, train_reader, train_reader, acc, predict + + +if __name__ == "__main__": + paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train") + runtime_main(TestDistSimnetBow2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..095a474fd3ac056c678f9051ed80ef363ae968c9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -0,0 +1,231 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +import six +import tarfile +import string +import re +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "float32" +VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab' +VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2' +DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz' +DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a' + + +# Load dictionary. +def load_vocab(filename): + vocab = {} + if six.PY2: + with open(filename, 'r') as f: + for idx, line in enumerate(f): + vocab[line.strip()] = idx + else: + with open(filename, 'r', encoding="utf-8") as f: + for idx, line in enumerate(f): + vocab[line.strip()] = idx + return vocab + + +def get_worddict(dict_path): + word_dict = load_vocab(dict_path) + word_dict[""] = len(word_dict) + dict_dim = len(word_dict) + return word_dict, dict_dim + + +def conv_net(input, + dict_dim, + emb_dim=128, + window_size=3, + num_filters=128, + fc0_dim=96, + class_dim=2): + emb = fluid.layers.embedding( + input=input, + size=[dict_dim, emb_dim], + is_sparse=False, + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=num_filters, + filter_size=window_size, + act="tanh", + pool_type="max", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + fc_0 = fluid.layers.fc( + input=[conv_3], + size=fc0_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + prediction = fluid.layers.fc( + input=[fc_0], + size=class_dim, + act="softmax", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + return prediction + + +def inference_network(dict_dim): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + out = conv_net(data, dict_dim) + return out + + +def get_reader(word_dict, batch_size): + # The training data set. + train_reader = paddle.batch(train(word_dict), batch_size=batch_size) + + # The testing data set. + test_reader = paddle.batch(test(word_dict), batch_size=batch_size) + + return train_reader, test_reader + + +def get_optimizer(learning_rate): + optimizer = fluid.optimizer.SGD(learning_rate=learning_rate) + return optimizer + + +class TestDistTextClassification2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + vocab = os.path.join(paddle.dataset.common.DATA_HOME, + "text_classification", "imdb.vocab") + word_dict, dict_dim = get_worddict(vocab) + + # Input data + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = conv_net(data, dict_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=predict, label=label) + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = get_optimizer(learning_rate=0.001) + opt.minimize(avg_cost) + + # Reader + train_reader, test_reader = get_reader(word_dict, batch_size) + + return inference_program, avg_cost, train_reader, test_reader, acc, predict + + +def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + + with tarfile.open( + paddle.dataset.common.download(DATA_URL, 'text_classification', + DATA_MD5)) as tarf: + # Note that we should use tarfile.next(), which does + # sequential access of member files, other than + # tarfile.extractfile, which does random access and might + # destroy hard disks. + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. + yield tarf.extractfile(tf).read().rstrip(six.b( + "\n\r")).translate( + None, six.b(string.punctuation)).lower().split() + tf = tarf.next() + + +def reader_creator(pos_pattern, neg_pattern, word_idx): + UNK = word_idx[''] + INS = [] + + def load(pattern, out, label): + for doc in tokenize(pattern): + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) + + def reader(): + for doc, label in INS: + yield doc, label + + return reader + + +def train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + re.compile("train/pos/.*\.txt$"), + re.compile("train/neg/.*\.txt$"), word_idx) + + +def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + re.compile("test/pos/.*\.txt$"), + re.compile("test/neg/.*\.txt$"), word_idx) + + +if __name__ == "__main__": + paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5) + paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5) + runtime_main(TestDistTextClassification2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 175bd130e5a8324227953eeeb769474e78f94fd2..a2cc57425841100a2b61279d1b447b88ed4b9a54 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size, if weight_sharing: predict = layers.matmul( x=dec_output, - y=fluid.get_var(word_emb_param_names[0]), + y=fluid.framework._get_var(word_emb_param_names[0]), transpose_y=True) else: predict = layers.fc(input=dec_output, @@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, use_cuda, args): - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - TrainTaskConfig.use_gpu = use_cuda - sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model( + def run_trainer(self, args): + TrainTaskConfig.use_gpu = args.use_cuda + sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( args.is_dist, not args.sync_mode) if args.is_dist: @@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase): TrainTaskConfig.batch_size = 20 trainer_prog = fluid.default_main_program() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + startup_exe = fluid.Executor(place) TrainTaskConfig.local = not args.is_dist diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index f3e740fc7027a4a562b836c3113b87d55062c185..835306edd0f17490dd10110db40f42dce30b25bb 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): if __name__ == "__main__": + import os + os.environ['CPU_NUM'] = '1' + os.environ['USE_CUDA'] = "FALSE" runtime_main(TestDistWord2vec2x2) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b5549c507ed753f4504afd655be59b444164e6f3..e97643cddef22465436051a41ef4b825e9634d23 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -345,7 +345,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + - str(actual_t)) + str(actual_t) + " in class " + self.__class__.__name__) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py index 1de4a9d016a177944253d12094722d3a05614be2..810e8a1a8547a92de923877695178e780981edeb 100644 --- a/python/paddle/fluid/tests/unittests/test_auc_op.py +++ b/python/paddle/fluid/tests/unittests/test_auc_op.py @@ -36,7 +36,11 @@ class TestAucOp(OpTest): "StatPos": stat_pos, "StatNeg": stat_neg } - self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} + self.attrs = { + 'curve': 'ROC', + 'num_thresholds': num_thresholds, + "slide_steps": 1 + } python_auc = metrics.Auc(name="auc", curve='ROC', @@ -45,7 +49,6 @@ class TestAucOp(OpTest): self.outputs = { 'AUC': np.array(python_auc.eval()), - 'BatchAUC': np.array(python_auc.eval()), 'StatPosOut': np.array(python_auc._stat_pos), 'StatNegOut': np.array(python_auc._stat_neg) } diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 6a2732e9399aa5a93f4c47eb73bfd23dba608c3d..2ecc2504a8c9c5ecfc32cee96df9e368ff219cbb 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -67,6 +67,7 @@ class TestConv2dOp(OpTest): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False + self.use_cuda = False self.use_mkldnn = False self.data_format = "AnyLayout" self.dtype = np.float32 @@ -101,24 +102,25 @@ class TestConv2dOp(OpTest): } self.outputs = {'Output': output} - def testcudnn(self): - return core.is_compiled_with_cuda() and self.use_cudnn + def testcuda(self): + return core.is_compiled_with_cuda() and (self.use_cudnn or + self.use_cuda) def test_check_output(self): - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() self.check_grad_with_place( place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() self.check_grad_with_place( place, ['Input'], 'Output', @@ -128,7 +130,7 @@ class TestConv2dOp(OpTest): def test_check_grad_no_input(self): if self.dtype == np.float16: return - place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace() self.check_grad_with_place( place, ['Filter'], 'Output', @@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): class TestDepthwiseConv(TestConv2dOp): def init_test_case(self): + self.use_cuda = True self.pad = [1, 1] self.stride = [2, 2] self.input_size = [2, 3, 5, 5] # NCHW self.groups = 3 assert np.mod(self.input_size[1], self.groups) == 0 f_c = self.input_size[1] // self.groups - self.filter_size = [6, f_c, 3, 3] + self.filter_size = [3, f_c, 3, 3] self.op_type = "depthwise_conv2d" class TestDepthwiseConv2(TestConv2dOp): def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv3(TestConv2dOp): + def init_test_case(self): + self.use_cuda = True self.pad = [1, 1] self.stride = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW @@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp): self.op_type = "depthwise_conv2d" +class TestDepthwiseConvWithDilation(TestConv2dOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvWithDilation2(TestConv2dOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f6eb8f2c6d8b94f92e24ff789c91efb53a645a46..0c5343a97d5ef0f97fc6b144dfc82174eacb8573 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -20,6 +20,7 @@ import six import sys import collections import math +import paddle.fluid as fluid from op_test import OpTest @@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest): self.detect = np.array(self.detect).astype('float32') self.mAP = np.array(self.mAP).astype('float32') - if (len(self.class_pos_count) > 0): + if len(self.class_pos_count) > 0: self.class_pos_count = np.array(self.class_pos_count).astype( 'int32') self.true_pos = np.array(self.true_pos).astype('float32') @@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): def init_test_case(self): super(TestDetectionMAPOpMultiBatch, self).init_test_case() - self.class_pos_count = [0, 2, 1] + self.class_pos_count = [0, 2, 1, 0] self.true_pos_lod = [[0, 3, 2]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.false_pos_lod = [[0, 3, 2]] diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 37cad73019c529f64868b6ad3c6e2fffe59cc0d8..04924bec057e301bfb342a62bb4c1e0b3c3aff4c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -18,23 +18,27 @@ import time import unittest import os import sys -import six import signal import subprocess +import six import argparse +import paddle.fluid as fluid + +RUN_STEP = 10 + class TestDistRunnerBase(object): def get_model(self, batch_size=2): raise NotImplementedError( "get_model should be implemented by child classes.") - def get_transpiler(self, trainer_id, main_program, pserver_endpoints, - trainers, sync_mode): + @staticmethod + def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers, + sync_mode): # NOTE: import fluid until runtime, or else forking processes will cause error. - import paddle - import paddle.fluid as fluid - t = fluid.DistributeTranspiler() + config = fluid.DistributeTranspilerConfig() + t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, program=main_program, @@ -44,11 +48,9 @@ class TestDistRunnerBase(object): return t def run_pserver(self, args): - import paddle - import paddle.fluid as fluid + self.get_model(batch_size=2) - if args.mem_opt: - fluid.memory_optimize(fluid.default_main_program()) + # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode) @@ -61,29 +63,34 @@ class TestDistRunnerBase(object): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, use_cuda, args): - import paddle - import paddle.fluid as fluid - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + def run_trainer(self, args): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=2) + if args.mem_opt: - fluid.memory_optimize(fluid.default_main_program()) + fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) if args.is_dist: t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode) + trainer_prog = t.get_trainer_program() else: trainer_prog = fluid.default_main_program() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + startup_exe = fluid.Executor(place) startup_exe.run(fluid.default_startup_program()) strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 strategy.allow_op_delay = False + build_stra = fluid.BuildStrategy() if args.use_reduce: @@ -92,7 +99,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce exe = fluid.ParallelExecutor( - use_cuda, + args.use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, build_strategy=build_stra) @@ -103,27 +110,26 @@ class TestDistRunnerBase(object): ] feeder = fluid.DataFeeder(feed_var_list, place) - reader_generator = test_reader() + reader_generator = train_reader() - data = next(reader_generator) - first_loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(data)) - print(first_loss) + def get_data(): + origin_batch = next(reader_generator) + if args.is_dist and args.use_reader_alloc: + new_batch = [] + for offset, item in enumerate(origin_batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return origin_batch - for i in six.moves.xrange(5): - data = next(reader_generator) - loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) - - data = next(reader_generator) - last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) - print(last_loss) + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + print(loss) def runtime_main(test_class): - import paddle - import paddle.fluid as fluid - import paddle.fluid.core as core - parser = argparse.ArgumentParser(description='Run dist test.') parser.add_argument( '--role', type=str, required=True, choices=['pserver', 'trainer']) @@ -135,7 +141,10 @@ def runtime_main(test_class): '--current_endpoint', type=str, required=False, default="") parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--mem_opt', action='store_true') + parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_reduce', action='store_true') + parser.add_argument( + '--use_reader_alloc', action='store_true', required=False, default=True) args = parser.parse_args() @@ -143,8 +152,7 @@ def runtime_main(test_class): if args.role == "pserver" and args.is_dist: model.run_pserver(args) else: - use_cuda = True if core.is_compiled_with_cuda() else False - model.run_trainer(use_cuda, args) + model.run_trainer(args) import paddle.compat as cpt @@ -156,6 +164,17 @@ class TestDistBase(unittest.TestCase): def _setup_config(self): raise NotImplementedError("tests should have _setup_config implemented") + def _after_setup_config(self): + if self._enforce_place == "CPU": + self.__use_cuda = False + elif self._enforce_place == "GPU": + self.__use_cuda = True + else: + if fluid.core.is_compiled_with_cuda(): + self.__use_cuda = True + else: + self.__use_cuda = False + def setUp(self): self._trainers = 2 self._pservers = 2 @@ -163,24 +182,27 @@ class TestDistBase(unittest.TestCase): self._find_free_port(), self._find_free_port()) self._python_interp = "python" self._sync_mode = True + self._enforce_place = None self._mem_opt = False self._use_reduce = False + self._use_reader_alloc = True self._setup_config() + self._after_setup_config() def _find_free_port(self): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: s.bind(('', 0)) return s.getsockname()[1] - def start_pserver(self, model_file, check_error_log): + def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps0_cmd = ps_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers) + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) ps1_cmd = ps_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers) + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) if self._sync_mode: ps0_cmd += " --sync_mode" @@ -189,23 +211,23 @@ class TestDistBase(unittest.TestCase): ps0_cmd += " --mem_opt" ps1_cmd += " --mem_opt" - ps0_pipe = subprocess.PIPE - ps1_pipe = subprocess.PIPE - if check_error_log: - print(ps0_cmd) - print(ps1_cmd) - ps0_pipe = open("/tmp/ps0_err.log", "wb") - ps1_pipe = open("/tmp/ps1_err.log", "wb") + print(ps0_cmd) + print(ps1_cmd) + ps0_pipe = open("/tmp/ps0_err.log", "wb") + ps1_pipe = open("/tmp/ps1_err.log", "wb") ps0_proc = subprocess.Popen( - ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) + ps0_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps0_pipe, + env=required_envs) ps1_proc = subprocess.Popen( - ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) + ps1_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps1_pipe, + env=required_envs) - if not check_error_log: - return ps0_proc, ps1_proc, None, None - else: - return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe + return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe def _wait_ps_ready(self, pid): retry_times = 50 @@ -222,59 +244,59 @@ class TestDistBase(unittest.TestCase): (e, retry_times)) retry_times -= 1 - def check_with_place(self, model_file, delta=1e-3, check_error_log=False): - # TODO(typhoonzero): should auto adapt GPU count on the machine. - required_envs = { - "PATH": os.getenv("PATH", ""), - "PYTHONPATH": os.getenv("PYTHONPATH", ""), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), - "FLAGS_fraction_of_gpu_memory_to_use": "0.15", - "FLAGS_cudnn_deterministic": "1", - "CPU_NUM": "1" - } + def _run_local(self, model, envs, check_error_log): - if check_error_log: - required_envs["GLOG_v"] = "7" - required_envs["GLOG_logtostderr"] = "1" + cmd = "%s %s --role trainer" % (self._python_interp, model) - # Run local to get a base line - env_local = {"CUDA_VISIBLE_DEVICES": "0"} - env_local.update(required_envs) - local_cmd = "%s %s --role trainer" % (self._python_interp, model_file) - if not check_error_log: - local_proc = subprocess.Popen( - local_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env_local) + if self.__use_cuda: + cmd += " --use_cuda" + env_local = {"CUDA_VISIBLE_DEVICES": "0"} else: + env_local = {'CPU_NUM': '1'} + + envs.update(env_local) + + if check_error_log: err_log = open("/tmp/trainer.err.log", "wb") local_proc = subprocess.Popen( - local_cmd.split(" "), + cmd.split(" "), stdout=subprocess.PIPE, stderr=err_log, - env=env_local) + env=envs) + else: + local_proc = subprocess.Popen( + cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=envs) - local_proc.wait() - out, err = local_proc.communicate() - local_ret = cpt.to_text(out) - sys.stderr.write('local_loss: %s\n' % local_ret) - sys.stderr.write('local_stderr: %s\n' % err) + local_out, local_err = local_proc.communicate() + local_ret = cpt.to_text(local_out) + + if check_error_log: + err_log.close() + sys.stderr.write('local_stdout: %s\n' % local_ret) + sys.stderr.write('local_stderr: %s\n' % local_err) + + local_losses = local_ret.split("\n") + return local_losses + + def _run_cluster(self, model, envs, check_error_log): # Run dist train to compare with local results - ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file, - check_error_log) + ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, + check_error_log, envs) self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps1.pid) - ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" tr0_cmd = tr_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, - 0, ps0_ep, self._trainers) + (self._python_interp, model, self._ps_endpoints, + 0, ps0_ep, self._trainers) tr1_cmd = tr_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, - 1, ps1_ep, self._trainers) + (self._python_interp, model, self._ps_endpoints, + 1, ps1_ep, self._trainers) if self._sync_mode: tr0_cmd += " --sync_mode" @@ -285,20 +307,25 @@ class TestDistBase(unittest.TestCase): if self._use_reduce: tr0_cmd += " --use_reduce" tr1_cmd += " --use_reduce" + if self._use_reader_alloc: + tr0_cmd += " --use_reader_alloc" + tr1_cmd += " --use_reader_alloc" + if self.__use_cuda: + tr0_cmd += " --use_cuda" + tr1_cmd += " --use_cuda" + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + else: + env0 = {'CPU_NUM': '1'} + env1 = {'CPU_NUM': '1'} - env0 = {"CUDA_VISIBLE_DEVICES": "0"} - env1 = {"CUDA_VISIBLE_DEVICES": "1"} - env0.update(required_envs) - env1.update(required_envs) - FNULL = open(os.devnull, 'w') + env0.update(envs) + env1.update(envs) - tr0_pipe = subprocess.PIPE - tr1_pipe = subprocess.PIPE - if check_error_log: - print("tr0_cmd:", tr0_cmd) - print("tr1_cmd:", tr1_cmd) - tr0_pipe = open("/tmp/tr0_err.log", "wb") - tr1_pipe = open("/tmp/tr1_err.log", "wb") + print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0)) + print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1)) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = open("/tmp/tr1_err.log", "wb") tr0_proc = subprocess.Popen( tr0_cmd.strip().split(" "), @@ -311,35 +338,65 @@ class TestDistBase(unittest.TestCase): stderr=tr1_pipe, env=env1) - tr0_proc.wait() - tr1_proc.wait() - out, err = tr0_proc.communicate() - sys.stderr.write('dist_stderr: %s\n' % err) - loss_data0 = cpt.to_text(out) - sys.stderr.write('dist_loss: %s\n' % loss_data0) - lines = loss_data0.split("\n") - dist_first_loss = eval(lines[0].replace(" ", ","))[0] - dist_last_loss = eval(lines[1].replace(" ", ","))[0] - - local_lines = local_ret.split("\n") - local_first_loss = eval(local_lines[0])[0] - local_last_loss = eval(local_lines[1])[0] + tr0_out, tr0_err = tr0_proc.communicate() + tr0_loss_text = cpt.to_text(tr0_out) + tr1_out, tr1_err = tr1_proc.communicate() + tr1_loss_text = cpt.to_text(tr1_out) # close trainer file - if check_error_log: - tr0_pipe.close() - tr1_pipe.close() + tr0_pipe.close() + tr1_pipe.close() - ps0_pipe.close() - ps1_pipe.close() + ps0_pipe.close() + ps1_pipe.close() # FIXME: use terminate() instead of sigkill. os.kill(ps0.pid, signal.SIGKILL) os.kill(ps1.pid, signal.SIGKILL) ps0.terminate() ps1.terminate() - ps0.wait() - ps1.wait() - FNULL.close() - self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) - self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) + # print log + sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) + sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err) + sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + + tr0_losses = tr0_loss_text.split("\n") + tr1_losses = tr1_loss_text.split("\n") + + return tr0_losses, tr1_losses + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + # TODO(typhoonzero): should auto adapt GPU count on the machine. + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_cudnn_deterministic": "1", + "http_proxy": "" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "7" + required_envs["GLOG_logtostderr"] = "1" + + local_losses\ + = self._run_local(model_file, required_envs, + check_error_log) + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs, + check_error_log) + + for step_id in range(RUN_STEP): + local_loss = eval(local_losses[step_id])[0] + tr0_loss = eval(tr0_losses[step_id])[0] + tr1_loss = eval(tr1_losses[step_id])[0] + dist_loss = (tr0_loss + tr1_loss) / 2 + print(str(local_loss) + ":" + str(dist_loss)) + self.assertAlmostEqual(local_loss, dist_loss, delta=delta) diff --git a/python/paddle/fluid/tests/notest_concurrency.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py similarity index 54% rename from python/paddle/fluid/tests/notest_concurrency.py rename to python/paddle/fluid/tests/unittests/test_dist_ctr.py index fd9da4cce0ea51c53b4b01e7c3dc2a2ed1eeb089..3575fd07fc727bd6c6b07a19a60b1df6656ae9e2 100644 --- a/python/paddle/fluid/tests/notest_concurrency.py +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -11,31 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from __future__ import print_function +import os import unittest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.executor import Executor - +from test_dist_base import TestDistBase -class TestRoutineOp(unittest.TestCase): - def test_simple_routine(self): - ch = fluid.make_channel( - dtype=core.VarDesc.VarType.BOOL, name="CreateChannel") - with fluid.Go(): - fluid.channel_send(ch, True) - result = fluid.channel_recv(ch) - fluid.channel_close(ch) +class TestDistCTR2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" - cpu = core.CPUPlace() - exe = Executor(cpu) - outs = exe.run(fetch_list=[result]) - self.assertEqual(outs[0], True) +def test_dist_ctr(self): + self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 09b1c546e49bd02bf336f31885bf4c7339cc5a2c..f65dd7e2a28c4ace3988c0cc1267ebe981fbd9cb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase): self._use_reduce = False def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-7) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnist2x2WithMemopt(TestDistBase): @@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase): self._mem_opt = True def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-7) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnistAsync(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2b089694ea2f329e67ad6c50def26caa454720e..c0989ca709e100d8f147a08970b0e858c81ce09b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -20,24 +20,25 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): def _setup_config(self): self._sync_mode = True + self._use_reader_alloc = False def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=1e-7) + self.check_with_place("dist_se_resnext.py", delta=100) -# TODO(typhoonzero): fix this test -# class TestDistseResnXt2x2WithMemopt(TestDistBase): -# def _setup_config(self): -# self._sync_mode = True -# self._mem_opt = True +class TestDistseResnXt2x2WithMemopt(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._mem_opt = True -# def test_dist_train(self): -# self.check_with_place("dist_se_resnext.py", delta=1e-7) + def test_dist_train(self): + self.check_with_place("dist_se_resnext.py", delta=100) class TestDistSeResneXt2x2Async(TestDistBase): def _setup_config(self): self._sync_mode = False + self._use_reader_alloc = False def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py new file mode 100644 index 0000000000000000000000000000000000000000..e971f29db42a7c1a2394505a8ece3d2fd6b347e9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -0,0 +1,79 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import os +import unittest + +from test_dist_base import TestDistBase + + +class TestDistSimnetBowDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2DenseAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBowSparse2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2SparseAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..0c1680359e2b84807084b06eab0534b41ecd6133 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os +import unittest +from test_dist_base import TestDistBase + + +class TestDistTextClassification2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_text_classification(self): + self.check_with_place("dist_text_classification.py", delta=1e-6) + + +class TestDistTextClassification2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._enforce_place = "CPU" + + def test_se_resnext(self): + self.check_with_place("dist_text_classification.py", delta=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 1b7f0745cd758a7534dec9f48132e9cb44a994e6..54a1c68a37f6929890aab697b48d621e6effb7d8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest): ]) +class TestDecayedAdagrad(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1) + opt.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer, _ = self.get_trainer() + + class TestLRDecayConditional(TranspilerTest): def net_conf(self): x = fluid.layers.data(name='x', shape=[1000], dtype='float32') @@ -661,22 +680,25 @@ class TestLoadSliceVar(TranspilerTest): class TestNCCL2Transpile(TranspilerTest): def test_nccl2_transpile(self): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - self.net_conf() - - config = fluid.DistributeTranspilerConfig() - config.mode = "nccl2" - t = fluid.DistributeTranspiler(config=config) - t.transpile( - 0, - trainers="127.0.0.1:6174,127.0.0.1:6175", - current_endpoint="127.0.0.1:6174", - startup_program=startup) - print([op.type for op in startup.global_block().ops]) - self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id") - self.assertIsNotNone(startup.global_block().vars.get("NCCLID")) + if fluid.core.is_compiled_with_cuda(): #test nccl2 only with cuda + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + self.net_conf() + + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile( + 0, + trainers="127.0.0.1:6174,127.0.0.1:6175", + current_endpoint="127.0.0.1:6174", + startup_program=startup) + print([op.type for op in startup.global_block().ops]) + self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id") + self.assertIsNotNone(startup.global_block().vars.get("NCCLID")) + else: + pass if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 33b39b262b95b0013e3696c3f15a288a2e801ce1..b26cbdbea12962a3a41036c774de5dfb61999205 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase): self._sync_mode = False def test_dist_train(self): - self.check_with_place("dist_word2vec.py", delta=1) + self.check_with_place("dist_word2vec.py", delta=100) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 86e27fe29ed945ec77fbbcdbd1c7cc6ecfba0fd5..9340d558577b4b3141df9317900ee33bbb683a0e 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest): 'eta': self.eta } - print("lod = ", self.lod) self.outputs = { 'RpnRois': (self.rpn_rois[0], [self.lod]), 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]) @@ -295,7 +294,7 @@ class TestGenerateProposalsOp(OpTest): self.post_nms_topN = 5000 # train 6000, test 1000 self.nms_thresh = 0.7 self.min_size = 3.0 - self.eta = 0.8 + self.eta = 1. def init_test_input(self): batch_size = 1 diff --git a/python/paddle/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py index a3d700aad8236fea7bb0e6d043323ad3bd0851f2..fdff22cacc28731a91ff4fd17407bd9edbdd9d8b 100644 --- a/python/paddle/fluid/tests/unittests/test_infer_shape.py +++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py @@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase): mul_op_desc.set_input("X", ["x"]) mul_op_desc.set_input("Y", ["y"]) mul_op_desc.set_output("Out", ["out"]) - mul_op_desc.set_attr("x_num_col_dims", 1) - mul_op_desc.set_attr("y_num_col_dims", 1) + mul_op_desc._set_attr("x_num_col_dims", 1) + mul_op_desc._set_attr("y_num_col_dims", 1) mul_op_desc.check_attrs() mul_op_desc.infer_shape(block) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f474cdae2054531d44724e0e3e0e58a35fb8ddcd..1d8d0b55f0c5d7cffa01a100847bdf48b6d7023d 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -541,7 +541,7 @@ class TestBook(unittest.TestCase): with program_guard(program): input = layers.data( name="input", shape=[3, 100, 100], dtype="float32") - out = layers.shape(input, name="shape") + out = layers.shape(input) self.assertIsNotNone(out) print(str(program)) @@ -758,6 +758,65 @@ class TestBook(unittest.TestCase): out = layers.expand(x, [1, 2]) print(str(program)) + def test_uniform_random_batch_size_like(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + out = layers.uniform_random_batch_size_like(input, [-1, 11]) + self.assertIsNotNone(out) + print(str(program)) + + def test_gaussian_random(self): + program = Program() + with program_guard(program): + out = layers.gaussian_random(shape=[20, 30]) + self.assertIsNotNone(out) + print(str(program)) + + def test_sampling_id(self): + program = Program() + with program_guard(program): + x = layers.data( + name="X", + shape=[13, 11], + dtype='float32', + append_batch_size=False) + + out = layers.sampling_id(x) + self.assertIsNotNone(out) + print(str(program)) + + def test_gaussian_random_batch_size_like(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.gaussian_random_batch_size_like( + input, shape=[-1, 11], mean=1.0, std=2.0) + self.assertIsNotNone(out) + print(str(program)) + + def test_sum(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.sum(input) + self.assertIsNotNone(out) + print(str(program)) + + def test_slice(self): + starts = [1, 0, 2] + ends = [3, 3, 4] + axes = [0, 1, 2] + + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 4, 5, 6], dtype='float32') + + out = layers.slice(input, axes=axes, starts=starts, ends=ends) + def test_softshrink(self): program = Program() with program_guard(program): @@ -766,6 +825,15 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def iou_similarity(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[16], dtype="float32") + y = layers.data(name="y", shape=[16], dtype="float32") + out = layers.iou_similarity(x, y, name='iou_similarity') + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..288c5f6a1f6b1760ca40c0c653e4c0726b799519 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -0,0 +1,121 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import unittest +import os +import sys +import math + + +def simple_fc_net(): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestPassBuilder(unittest.TestCase): + def check_network_convergence(self, use_cuda, build_strategy=None): + os.environ['CPU_NUM'] = str(4) + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = simple_fc_net() + test_program = main.clone(for_test=True) + + opt = fluid.optimizer.SGD(learning_rate=0.001) + opt.minimize(loss) + + batch_size = 32 + image = np.random.normal(size=(batch_size, 784)).astype('float32') + label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup) + feed_dict = {'image': image, 'label': label} + + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + loss_name=loss.name, + main_program=main, + build_strategy=build_strategy) + + test_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=test_program, + share_vars_from=train_exe, + build_strategy=build_strategy) + + for i in range(5): + test_loss, = test_exe.run([loss.name], feed=feed_dict) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + + avg_test_loss_val = np.array(test_loss).mean() + if math.isnan(float(avg_test_loss_val)): + sys.exit("got NaN loss, testing failed.") + + avg_train_loss_val = np.array(train_loss).mean() + if math.isnan(float(avg_train_loss_val)): + sys.exit("got NaN loss, training failed.") + + self.assertTrue( + np.allclose( + train_loss, test_loss, atol=1e-8), + "Train loss: " + str(train_loss) + "\n Test loss:" + + str(test_loss)) + + def test_parallel_testing_with_new_strategy(self): + build_strategy = fluid.BuildStrategy() + pass_builder = build_strategy._create_passes_from_strategy() + origin_len = len(pass_builder.all_passes()) + + viz_pass = pass_builder.append_pass("graph_viz_pass") + self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) + + pass_builder.insert_pass( + len(pass_builder.all_passes()), "graph_viz_pass") + self.assertEqual(origin_len + 2, len(pass_builder.all_passes())) + + pass_builder.remove_pass(len(pass_builder.all_passes()) - 1) + self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) + viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass") + + self.check_network_convergence( + use_cuda=core.is_compiled_with_cuda(), + build_strategy=build_strategy) + try: + os.stat("/tmp/test_viz_pass") + except os.error: + self.assertFalse(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index d24b5cbd06ddf9f332c1369ebd513bef27b77e14..7fb2171f611adea434d6f2710465810fb69d6979 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase): self.assertEqual(['z'], op.output("Out")) self.assertEqual(["Out"], op.output_names()) - op.set_attr("int_attr", 1) + op._set_attr("int_attr", 1) self.assertEqual(1, op.attr("int_attr")) self.assertTrue(op.has_attr("int_attr")) self.assertEqual(core.AttrType.INT, op.attr_type("int_attr")) - op.set_attr("float_attr", -1.32) + op._set_attr("float_attr", -1.32) self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4) self.assertTrue(op.has_attr("float_attr")) - op.set_attr("bool_attr", False) + op._set_attr("bool_attr", False) self.assertFalse(op.attr("bool_attr")) - op.set_attr("string_attr", "abc") + op._set_attr("string_attr", "abc") self.assertEqual("abc", op.attr("string_attr")) self.assertTrue(op.has_attr("string_attr")) - op.set_attr("ints_attr", [1, 2, 3]) + op._set_attr("ints_attr", [1, 2, 3]) self.assertEqual([1, 2, 3], op.attr("ints_attr")) expected = [1.2, 2.3, 3.4] - op.set_attr("floats_attr", expected) + op._set_attr("floats_attr", expected) for e, a in zip(expected, op.attr("floats_attr")): self.assertAlmostEqual(e, a, delta=1e-4) - op.set_attr("strings_attr", ["a", "b", "c"]) + op._set_attr("strings_attr", ["a", "b", "c"]) self.assertEqual(["a", "b", "c"], op.attr("strings_attr")) - op.set_attr("bools_attr", [True, False, True]) + op._set_attr("bools_attr", [True, False, True]) self.assertEqual([True, False, True], op.attr("bools_attr")) self.assertEqual(8, len(op.attr_names())) - op.set_block_attr("block_attr", program_desc.block(0)) - self.assertEqual(0, op.block_attr_id("block_attr")) + op.set_block_attr("_block_attr", program_desc.block(0)) + self.assertEqual(0, op._block_attr_id("_block_attr")) mul_op = block.append_op() mul_op.set_type("mul") diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index ab7a18d4c5c4ce1e490e2951ff9fbb023324e753..143d187edc3a154418f9e639b7d492c8ce994d42 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -246,6 +246,7 @@ def prepare_encoder(src_word, padding_idx=pos_pad_idx, param_attr=fluid.ParamAttr( name=pos_enc_param_name, trainable=False)) + src_pos_enc.stop_gradient = True enc_input = src_word_emb + src_pos_enc # FIXME(guosheng): Decouple the program desc with batch_size. diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index 28c7ae5341b20f0f79da8cf682d279fc4cc3fa19..c9a8176a72fb744963ae466e965a25bdfb0a44de 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory from .ps_dispatcher import HashName, RoundRobin __all__ = [ - "DistributeTranspiler", "memory_optimize", "release_memory", "HashName", - "RoundRobin", "DistributeTranspilerConfig" + "DistributeTranspiler", + "memory_optimize", + "release_memory", + "HashName", + "RoundRobin", + "DistributeTranspilerConfig", ] diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index 59899e7e9ab98f661699d5ac0645c92bd23a1512..391d6aa12bdd70b9ef988898bee8e86cd0a0d765 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -128,7 +128,7 @@ def op_to_code(op): attr_type = op.desc.attr_type(name) if attr_type == core.AttrType.BLOCK: a = "{name} = block[{value}]".format( - name=name, type=attr_type, value=op.block_attr_id(name)) + name=name, type=attr_type, value=op._block_attr_id(name)) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " @@ -136,7 +136,7 @@ def op_to_code(op): if attr_type == core.AttrType.BLOCKS: a = "{name} = blocks{value}".format( - name=name, type=attr_type, value=op.blocks_attr_ids(name)) + name=name, type=attr_type, value=op._blocks_attr_ids(name)) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 43071def7a906e585909e50e4c0c52c56d981cde..ecdbe27f4d90268d755a712e25289cfaf4715f29 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -39,8 +39,8 @@ import six from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ - default_startup_program, Block, \ - Parameter, grad_var_name + default_startup_program, Block, \ + Parameter, grad_var_name from .details import * from functools import reduce @@ -178,7 +178,7 @@ class DistributeTranspiler(object): pserver_program) elif role == "TRAINER": trainer_program = t.get_trainer_program() - + # for nccl2 mode config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" @@ -470,7 +470,10 @@ class DistributeTranspiler(object): """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? + lr_ops = self._get_lr_ops() delete_ops(self.origin_program.global_block(), self.optimize_ops) + delete_ops(self.origin_program.global_block(), lr_ops) + self.origin_program.__str__() if wait_port: @@ -534,7 +537,7 @@ class DistributeTranspiler(object): }) for varname, splited_var in six.iteritems(self.param_var_mapping): - #add concat ops to merge splited parameters received from parameter servers. + # add concat ops to merge splited parameters received from parameter servers. if len(splited_var) <= 1: continue # NOTE: if enable memory optimization, origin vars maybe removed. @@ -668,7 +671,7 @@ in a single call.") __clone_lr_op_sub_block__(cloned_op, program, new_sub_block) # reset the block of op - op.set_attr('sub_block', new_sub_block) + op._set_attr('sub_block', new_sub_block) # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() @@ -734,19 +737,14 @@ in a single call.") table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) optimize_blocks.append(table_opt_block) - prefetch_var_name_to_block_id = self._create_prefetch_block( + lookup_table_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( pserver_program, table_opt_block.idx) pserver_program._distributed_lookup_table = self.table_name - - # NOTE: if has_distributed_lookup_table is False, then prefetch_block will - # not be executed, so it's safe to use optimize_block to hold the place - if self.has_distributed_lookup_table: - assert len(prefetch_var_name_to_block_id) > 0 - else: - assert len(prefetch_var_name_to_block_id) == 0 + prefetch_var_name_to_block_id.extend( + lookup_table_var_name_to_block_id) attrs = { "optimize_blocks": optimize_blocks, @@ -755,11 +753,14 @@ in a single call.") "sync_mode": self.sync_mode, "grad_to_block_id": grad_to_block_id, } - if len(prefetch_var_name_to_block_id) > 0: - attrs['prefetch_var_name_to_block_id'] \ - = prefetch_var_name_to_block_id + + if self.has_distributed_lookup_table: attrs['checkpint_block_id'] = checkpoint_block_id + if len(prefetch_var_name_to_block_id) > 0: + attrs[ + 'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id + # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", @@ -864,7 +865,7 @@ to transpile() call.") if op.type in [ "gaussian_random", "fill_constant", "uniform_random" ]: - op.set_attr("shape", list(new_outputs["Out"].shape)) + op._set_attr("shape", list(new_outputs["Out"].shape)) s_prog.global_block().append_op( type=op.type, inputs=new_inputs, @@ -1013,7 +1014,7 @@ to transpile() call.") for g, p in zip(grad_blocks, param_blocks): g_name, g_bid, _ = g.split(":") p_name, p_bid, _ = p.split(":") - self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ + self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ self.param_var_mapping[p_name][int(p_bid)] # create mapping of endpoint -> split var to create pserver side program @@ -1320,7 +1321,7 @@ to transpile() call.") if len(splited) == 1: if self.sync_mode and add_trainer_suffix: new_var_name = "%s.trainer_%d" % \ - (orig_var.name, self.trainer_id) + (orig_var.name, self.trainer_id) program.global_block()._rename_var(varname, new_var_name) var_mapping[varname] = \ [program.global_block().var(new_var_name)] @@ -1343,10 +1344,10 @@ to transpile() call.") new_var_name = "" if self.sync_mode and add_trainer_suffix: new_var_name = "%s.block%d.trainer_%d" % \ - (varname, i, self.trainer_id) + (varname, i, self.trainer_id) else: new_var_name = "%s.block%d" % \ - (varname, i) + (varname, i) var = program.global_block().create_var( name=new_var_name, persistable=False, @@ -1430,6 +1431,9 @@ to transpile() call.") elif op_type == "rmsprop": if varkey in ["Moment", "MeanSquare"]: return param_shape + elif op_type == "decayed_adagrad": + if varkey == "Moment": + return param_shape elif op_type == "sgd": pass return orig_shape @@ -1484,9 +1488,8 @@ to transpile() call.") vars2merge = [] for i in range(self.trainer_num): per_trainer_name = "%s.trainer_%d" % \ - (merged_var_name, i) + (merged_var_name, i) vars2merge.append(pserver_block.vars[per_trainer_name]) - optimize_block.append_op( type="sum", inputs={"X": vars2merge}, @@ -1645,7 +1648,7 @@ to transpile() call.") # one op's output is another op's input, we say # the two operator is connected. if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \ - set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): + set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): return True return False @@ -1662,7 +1665,7 @@ to transpile() call.") def _is_optimizer_op(self, op): if "Param" in op.input_names and \ - "LearningRate" in op.input_names: + "LearningRate" in op.input_names: return True return False @@ -1737,7 +1740,7 @@ to transpile() call.") # NOTE: we need to skip all optimize ops, since it is connected # with forward/backward ops and lr ops, we only need the lr ops. if op1 != op2 and self._is_op_connected(op1, op2) and \ - not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2): + not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2): ufind.union(op1, op2) # find all ops which is related with lr var for op1 in block.ops: diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index f30c81f0320fc605f98da430e68cfcb005e75c3d..82835c8c524b6cac696d71a690366ef155908582 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -124,7 +124,7 @@ class InferenceTranspiler(object): next_op = self.block.ops[i + 1] if next_op.type == 'relu': # modify bnorm OP to include relu - current_op.set_attr("fuse_relu", True) + current_op._set_attr("fuse_relu", True) # remove relu OP self.block._remove_op(i + 1) i = i + 1 @@ -163,7 +163,7 @@ class InferenceTranspiler(object): next_op = self.block.ops[i + 1] if next_op.type == 'relu': # modify bnorm OP to include relu - current_op.set_attr("fuse_with_relu", True) + current_op._set_attr("fuse_with_relu", True) # remove relu OP self.block._remove_op(i + 1) i = i + 1 @@ -377,7 +377,7 @@ class InferenceTranspiler(object): type=old_var.type, dtype=old_var.dtype, shape=old_var.shape) - op.rename_input(old_param_name, new_param_name) + op._rename_input(old_param_name, new_param_name) self.scope.var(new_param_name) tensor = self.scope.find_var(new_param_name).get_tensor() @@ -454,7 +454,7 @@ class InferenceTranspiler(object): :type eltwise_op: Operator ''' - conv_op.set_attr("fuse_eltwise", True) + conv_op._set_attr("fuse_eltwise", True) self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0] self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0] @@ -463,8 +463,8 @@ class InferenceTranspiler(object): current_op = self.block.ops[i] for input_arg in current_op.input_arg_names: if input_arg in self.input_map: - current_op.rename_input(input_arg, - self.input_map[input_arg]) + current_op._rename_input(input_arg, + self.input_map[input_arg]) def _remove_unused_var(self): ''' diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index d5aa54d752305b188d292f95f05cd70d27702c35..861bb5fae5d7a8561ded1f547fbb86ae1e1a073e 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -14,10 +14,10 @@ from __future__ import print_function -from collections import defaultdict, OrderedDict, Callable +from collections import defaultdict, MutableSet from .. import core from ... import compat as cpt -from ..framework import Program, default_main_program, Parameter, Variable +from ..framework import Program, default_main_program, Parameter, Variable, core from ..backward import _rename_arg_ from functools import reduce from six.moves import range @@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"), PRINT_LOG = False +class OrderedSet(MutableSet): + def __init__(self, iterable=None): + self.end = end = [] + end += [None, end, end] # sentinel node for doubly linked list + self.map = {} # key --> [key, prev, next] + if iterable is not None: + self |= iterable + + def __len__(self): + return len(self.map) + + def __contains__(self, key): + return key in self.map + + def add(self, key): + if key not in self.map: + end = self.end + curr = end[1] + curr[2] = end[1] = self.map[key] = [key, curr, end] + + def update(self, other): + for e in other: + self.add(e) + + def discard(self, key): + if key in self.map: + key, prev, next = self.map.pop(key) + prev[2] = next + next[1] = prev + + def remove(self, key): + self.discard(key) + + def __iter__(self): + end = self.end + curr = end[2] + while curr is not end: + yield curr[0] + curr = curr[2] + + def __reversed__(self): + end = self.end + curr = end[1] + while curr is not end: + yield curr[0] + curr = curr[1] + + def pop(self, last=True): + if not self: + raise KeyError('set is empty') + key = self.end[1][0] if last else self.end[2][0] + self.discard(key) + return key + + def __repr__(self): + if not self: + return '%s()' % (self.__class__.__name__, ) + return '%s(%r)' % (self.__class__.__name__, list(self)) + + def __eq__(self, other): + if isinstance(other, OrderedSet): + return len(self) == len(other) and list(self) == list(other) + return set(self) == set(other) + + class ControlFlowGraph(object): def __init__(self, program, ops, forward_num, skip_opt): self._program = program self._ops = ops self._forward_num = forward_num - self._successors = defaultdict(set) - self._presuccessors = defaultdict(set) - self._uses = defaultdict(set) - self._defs = defaultdict(set) - self._live_in = defaultdict(set) - self._live_out = defaultdict(set) + self._successors = defaultdict(OrderedSet) + self._presuccessors = defaultdict(OrderedSet) + self._uses = defaultdict(OrderedSet) + self._defs = defaultdict(OrderedSet) + self._live_in = defaultdict(OrderedSet) + self._live_out = defaultdict(OrderedSet) self._skip_opt = skip_opt self.pool = [] @@ -116,7 +181,7 @@ class ControlFlowGraph(object): # NOTE: must sort the in_diff set for cases that get different cache var. # FIXME(typhoonzero): maybe use a "sorted set" is better than this. can_optimize = [ - x for x in sorted(list(in_diff)) + x for x in in_diff if self._check_var_validity(block_desc, x, is_forward) ] if can_optimize: @@ -224,7 +289,7 @@ class ControlFlowGraph(object): if self.pool: # NOTE: must sort the in_diff set for cases that get different cache var. defs_can_optimize = [ - x for x in sorted(list(self._defs[i])) + x for x in self._defs[i] if self._check_var_validity(block_desc, x, is_forward) ] out_pair = [ @@ -381,7 +446,19 @@ def _get_cfgs(input_program): return cfgs -def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): +def _is_opt_role_op(op): + op_maker = core.op_proto_and_checker_maker + optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize + if op_maker.kOpRoleAttrName() in op.attr_names and \ + int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): + return True + + +def memory_optimize(input_program, + skip_opt_set=None, + print_log=False, + level=0, + skip_grads=False): """Optimize memory by reusing var memory. Note: it doesn't not support subblock nested in subblock. @@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): raise ValueError("only support opt_level 0 or 1.") global PRINT_LOG PRINT_LOG = print_log + if skip_grads: + grad_set = set() + OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName() + for op in input_program.global_block().ops: + if _is_opt_role_op(op): + if op.attr(OP_ROLE_VAR): + grad_name = op.attr(OP_ROLE_VAR)[1] + grad_set.add(grad_name) + if not skip_opt_set: + skip_opt_set = grad_set + else: + skip_opt_set.update(grad_set) + cfgs = _get_cfgs(input_program) for cfg in cfgs: cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) diff --git a/python/setup.py.in b/python/setup.py.in index 786c9f2e39880b68700b8acb94b3d35a48323958..b376be0ea373f089ef17f27435d979712fbdff72 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -106,6 +106,7 @@ packages=['paddle', 'paddle.fluid.layers', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', + 'paddle.fluid.contrib.quantize', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details']