未验证 提交 7968887f 编写于 作者: J jiaqi 提交者: GitHub

Merge branch 'develop' into dataset_merge_develop

...@@ -71,11 +71,11 @@ option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plan ...@@ -71,11 +71,11 @@ option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plan
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization." OFF) option(ON_INFER "Turn on inference optimization." OFF)
option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
option(WITH_WBAES "Compile PaddlePaddle with WBAES support" ON)
# PY_VERSION # PY_VERSION
if(NOT PY_VERSION) if(NOT PY_VERSION)
...@@ -149,7 +149,6 @@ include(external/dlpack) ...@@ -149,7 +149,6 @@ include(external/dlpack)
include(external/snappy) # download snappy include(external/snappy) # download snappy
include(external/snappystream) # download snappystream include(external/snappystream) # download snappystream
include(external/warpctc) # download, build, install warpctc include(external/warpctc) # download, build, install warpctc
include(external/wbaes) # download wbaes
if (NOT WIN32) if (NOT WIN32)
# there is no official support of nccl, cupti in windows # there is no official support of nccl, cupti in windows
......
...@@ -157,7 +157,3 @@ endif(WITH_BRPC_RDMA) ...@@ -157,7 +157,3 @@ endif(WITH_BRPC_RDMA)
if(ON_INFER) if(ON_INFER)
add_definitions(-DPADDLE_ON_INFERENCE) add_definitions(-DPADDLE_ON_INFERENCE)
endif(ON_INFER) endif(ON_INFER)
if(WITH_WBAES)
add_definitions(-DPADDLE_WITH_WBAES)
endif(WITH_WBAES)
...@@ -221,6 +221,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -221,6 +221,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=lib -DCMAKE_INSTALL_LIBDIR=lib
-DBUILD_SHARED_LIBS=OFF
CMAKE_CACHE_ARGS CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Skip the rest of this file unless WBAES support was requested.
# The variable is tested by name rather than as ${WITH_WBAES}: the expanded form
# becomes `IF(NOT )` (a configure-time error) when WITH_WBAES is undefined or
# empty, whereas the by-name form simply treats that case as "disabled".
IF(NOT WITH_WBAES)
    return()
ENDIF()
INCLUDE(ExternalProject)

# Install layout: ${THIRD_PARTY_PATH}/install/wbaes/{include,lib}.
SET(WBAES_DST_DIR "wbaes")
SET(WBAES_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(WBAES_INSTALL_DIR ${WBAES_INSTALL_ROOT}/${WBAES_DST_DIR})
SET(WBAES_ROOT ${WBAES_INSTALL_DIR})
SET(WBAES_INC_DIR ${WBAES_ROOT}/include)
SET(WBAES_LIB_DIR ${WBAES_ROOT}/lib)

# Add the WBAES lib directory to the install RPATH. LIST(APPEND) is used so
# that an empty CMAKE_INSTALL_RPATH does not leave a leading empty list element.
LIST(APPEND CMAKE_INSTALL_RPATH "${WBAES_ROOT}/lib")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

# Pick the archive URL and library file names for the host platform.
# Linux is pinned to v1.0.2 while macOS and Windows are pinned to v1.0.0.
# NOTE(review): the CACHE ... FORCE form overwrites any user-supplied
# WBAES_TAG / WBAES_URL on every configure -- confirm this is intended.
IF(APPLE)
    SET(WBAES_TAG "v1.0.0" CACHE STRING "" FORCE)
    SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.mac.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
    SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.dylib)
    SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.dylib)
ELSEIF(WIN32)
    SET(WBAES_TAG "v1.0.0" CACHE STRING "" FORCE)
    SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.windows-x64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
    # On Windows the two variables differ: WBAES_LIB is the .lib, WBAES_SHARED_LIB the .dll.
    SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.lib)
    SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.dll)
ELSE()
    SET(WBAES_TAG "v1.0.2" CACHE STRING "" FORCE)
    SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.linux-x86_64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
    SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.so)
    SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.so)
ENDIF()

# Name of the ExternalProject target and the directories it works in.
SET(WBAES_PROJECT "extern_wbaes")
MESSAGE(STATUS "WBAES_URL: ${WBAES_URL}, WBAES_LIB: ${WBAES_LIB}")
SET(WBAES_SOURCE_DIR "${THIRD_PARTY_PATH}/wbaes")
SET(WBAES_DOWNLOAD_DIR "${WBAES_SOURCE_DIR}/src/${WBAES_PROJECT}")
# Fetch the WBAES archive and copy its include/ and lib/ directories into the
# install tree. The configure and build steps are disabled (empty commands);
# only the install step does any work.
# NOTE(review): no URL_HASH / URL_MD5 is given, so the downloaded archive is
# not integrity-checked -- consider pinning a hash.
ExternalProject_Add(
    ${WBAES_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX                  ${WBAES_SOURCE_DIR}
    URL                     ${WBAES_URL}
    DOWNLOAD_DIR            ${WBAES_DOWNLOAD_DIR}
    DOWNLOAD_NO_PROGRESS    1
    CONFIGURE_COMMAND       ""
    BUILD_COMMAND           ""
    # Two install commands chained with the COMMAND keyword instead of a
    # shell "&&": they still run in order and stop on the first failure, but
    # no longer depend on a shell interpreting the operator.
    INSTALL_COMMAND
        ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/include ${WBAES_INC_DIR}
    COMMAND
        ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/lib ${WBAES_LIB_DIR}
)
# Add the WBAES header directory to the include path for this directory scope.
INCLUDE_DIRECTORIES(${WBAES_INC_DIR})

# Globally visible imported shared-library target pointing at the installed
# WBAES library file; it is marked as having no soname.
# NOTE(review): on WIN32 WBAES_LIB is the .lib file, not the .dll -- confirm
# that is the intended IMPORTED_LOCATION for a SHARED imported target.
ADD_LIBRARY(wbaes SHARED IMPORTED GLOBAL)
SET_TARGET_PROPERTIES(wbaes PROPERTIES
    IMPORTED_LOCATION   "${WBAES_LIB}"
    IMPORTED_NO_SONAME  1)

# Anything depending on `wbaes` must wait for the download/install step.
ADD_DEPENDENCIES(wbaes ${WBAES_PROJECT})
...@@ -264,14 +264,6 @@ function(cc_library TARGET_NAME) ...@@ -264,14 +264,6 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc) list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc) add_dependencies(${TARGET_NAME} warpctc)
endif() endif()
# Only deps libwbaes.so, not link
if("${cc_library_DEPS};" MATCHES "wbaes;")
list(REMOVE_ITEM cc_library_DEPS wbaes)
if(NOT "${TARGET_NAME}" MATCHES "dynload_wbaes")
list(APPEND cc_library_DEPS dynload_wbaes)
endif()
add_dependencies(${TARGET_NAME} wbaes)
endif()
# Only deps libmklml.so, not link # Only deps libmklml.so, not link
if("${cc_library_DEPS};" MATCHES "mklml;") if("${cc_library_DEPS};" MATCHES "mklml;")
list(REMOVE_ITEM cc_library_DEPS mklml) list(REMOVE_ITEM cc_library_DEPS mklml)
......
...@@ -170,14 +170,6 @@ copy(snappystream_lib ...@@ -170,14 +170,6 @@ copy(snappystream_lib
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream) DEPS snappystream)
if (WITH_WBAES)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/wbaes")
copy(wbaes_lib
SRCS ${WBAES_INC_DIR} ${WBAES_LIB}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS wbaes)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
......
...@@ -13,6 +13,7 @@ paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, d ...@@ -13,6 +13,7 @@ paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, d
paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3')) paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3'))
paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210')) paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210'))
paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912')) paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f06314a1cb30c96b5808dde2219c2dae'))
paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581')) paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581'))
...@@ -117,6 +118,8 @@ paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name ...@@ -117,6 +118,8 @@ paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name
paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4')) paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4'))
paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c')) paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c'))
paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca')) paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca'))
paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '646ca4d4a2cc16084f59de44b6927eca'))
paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'f36661060aeeaf6c6b1331e41b3726fa'))
paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3')) paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce')) paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6')) paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
...@@ -124,7 +127,7 @@ paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed ...@@ -124,7 +127,7 @@ paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed
paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2')) paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990')) paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '35c6a241bcc1a1fc89508860d82ad62b'))
paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9')) paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9'))
paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32')) paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32'))
paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab')) paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab'))
...@@ -155,10 +158,10 @@ paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon' ...@@ -155,10 +158,10 @@ paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon'
paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97')) paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97'))
paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1')) paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1'))
paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d')) paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d'))
paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '7a1966d7c3a48f1fc0881cdaf5d83b0b')) paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'd1b08c11bb9277386fcf6ae70b6622d1'))
paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7')) paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7'))
paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'e4fb4ed511b2293b8f04f7e872afbfd7')) paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'c45591fbc4f64a178fbca219e1546a58'))
paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '735fa9758a6d7ff3b47d7b827f961c1d')) paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', 'ae6d73cdc7f3a138d8a338ecdb33c1ae'))
paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '98f1c86716b9b7f4dda83f20e2adeee2')) paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '98f1c86716b9b7f4dda83f20e2adeee2'))
paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65f8e9d8ddfd0b412f940579c4faa342')) paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65f8e9d8ddfd0b412f940579c4faa342'))
paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '15b522457dfef103f0c20ca9d397678b')) paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '15b522457dfef103f0c20ca9d397678b'))
...@@ -203,6 +206,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'sha ...@@ -203,6 +206,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'sha
paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860')) paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860'))
paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a')) paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a'))
paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8')) paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8'))
paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'ee1386c42ecc8f424fe3fb21862fefc2'))
paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42')) paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42'))
paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012')) paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012'))
paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25')) paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25'))
...@@ -235,6 +239,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg ...@@ -235,6 +239,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec')) paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607')) paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '731b21c62a4add60a33bd76d802ffc5c'))
paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393')) paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139')) paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
...@@ -270,6 +275,7 @@ paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, de ...@@ -270,6 +275,7 @@ paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, de
paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8')) paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292')) paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb')) paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756'))
paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
...@@ -295,12 +301,12 @@ paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs ...@@ -295,12 +301,12 @@ paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs
paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655')) paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655'))
paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7')) paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', '72530f299d6451a567cf4a12dc3fb1ff'))
paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137'))
paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5'))
paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a')) paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732')) paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519')) paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
...@@ -359,8 +365,7 @@ paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_st ...@@ -359,8 +365,7 @@ paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_st
paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40')) paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40'))
paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae')) paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae'))
paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28')) paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28'))
paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8')) paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f8b2727bccf0f368c997d7cf05847e49'))
paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b'))
paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565')) paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565'))
paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......
...@@ -72,7 +72,6 @@ bool DataFeed::PickOneFile(std::string* filename) { ...@@ -72,7 +72,6 @@ bool DataFeed::PickOneFile(std::string* filename) {
} }
VLOG(3) << "file_idx_=" << *file_idx_; VLOG(3) << "file_idx_=" << *file_idx_;
*filename = filelist_[(*file_idx_)++]; *filename = filelist_[(*file_idx_)++];
// LOG(ERROR) << "pick file:" << *filename;
return true; return true;
} }
...@@ -466,6 +465,17 @@ void MultiSlotDataFeed::Init( ...@@ -466,6 +465,17 @@ void MultiSlotDataFeed::Init(
if (slot.is_used()) { if (slot.is_used()) {
use_slots_.push_back(all_slots_[i]); use_slots_.push_back(all_slots_[i]);
use_slots_is_dense_.push_back(slot.is_dense()); use_slots_is_dense_.push_back(slot.is_dense());
std::vector<int> local_shape;
if (slot.is_dense()) {
// for batch size holder if is_dense
if (slot.shape(0) > 0) {
local_shape.push_back(0);
}
}
for (size_t i = 0; i < slot.shape_size(); ++i) {
local_shape.push_back(slot.shape(i));
}
use_slots_shape_.push_back(local_shape);
} }
} }
feed_vec_.resize(use_slots_.size()); feed_vec_.resize(use_slots_.size());
...@@ -752,8 +762,8 @@ void MultiSlotDataFeed::PutToFeedVec( ...@@ -752,8 +762,8 @@ void MultiSlotDataFeed::PutToFeedVec(
LoD data_lod{offset}; LoD data_lod{offset};
feed_vec_[i]->set_lod(data_lod); feed_vec_[i]->set_lod(data_lod);
if (use_slots_is_dense_[i]) { if (use_slots_is_dense_[i]) {
int dim = total_instance / batch_size_; use_slots_shape_[i][0] = batch_size_;
feed_vec_[i]->Resize({batch_size_, dim}); feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
} }
} }
#endif #endif
...@@ -785,6 +795,16 @@ void MultiSlotInMemoryDataFeed::Init( ...@@ -785,6 +795,16 @@ void MultiSlotInMemoryDataFeed::Init(
if (slot.is_used()) { if (slot.is_used()) {
use_slots_.push_back(all_slots_[i]); use_slots_.push_back(all_slots_[i]);
use_slots_is_dense_.push_back(slot.is_dense()); use_slots_is_dense_.push_back(slot.is_dense());
std::vector<int> local_shape;
if (slot.is_dense()) {
if (slot.shape(0) > 0) {
local_shape.push_back(0);
}
}
for (size_t i = 0; i < slot.shape_size(); ++i) {
local_shape.push_back(slot.shape(i));
}
use_slots_shape_.push_back(local_shape);
} }
} }
feed_vec_.resize(use_slots_.size()); feed_vec_.resize(use_slots_.size());
...@@ -940,8 +960,8 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( ...@@ -940,8 +960,8 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
LoD data_lod{offset}; LoD data_lod{offset};
feed_vec_[i]->set_lod(data_lod); feed_vec_[i]->set_lod(data_lod);
if (use_slots_is_dense_[i]) { if (use_slots_is_dense_[i]) {
int dim = total_instance / batch_size_; use_slots_shape_[i][0] = batch_size_;
feed_vec_[i]->Resize({batch_size_, dim}); feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
} }
} }
#endif #endif
......
...@@ -142,6 +142,7 @@ class DataFeed { ...@@ -142,6 +142,7 @@ class DataFeed {
// object) // object)
std::vector<std::string> all_slots_; std::vector<std::string> all_slots_;
std::vector<std::string> all_slots_type_; std::vector<std::string> all_slots_type_;
std::vector<std::vector<int>> use_slots_shape_;
std::vector<int> std::vector<int>
use_slots_index_; // -1: not used; >=0: the index of use_slots_ use_slots_index_; // -1: not used; >=0: the index of use_slots_
......
...@@ -19,6 +19,7 @@ message Slot { ...@@ -19,6 +19,7 @@ message Slot {
required string type = 2; required string type = 2;
optional bool is_dense = 3 [ default = false ]; optional bool is_dense = 3 [ default = false ];
optional bool is_used = 4 [ default = false ]; optional bool is_used = 4 [ default = false ];
repeated int32 shape = 5; // we can define N-D Tensor
} }
message MultiSlotDesc { repeated Slot slots = 1; } message MultiSlotDesc { repeated Slot slots = 1; }
......
...@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
} }
} }
// TODO(gongwb) :polish them!
if (is_encoded) {
VLOG(1) << "Use dgc allreduce mode";
}
} }
#else #else
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
...@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() { ...@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
paddle::framework::GradOriginalVarName(in_var_handles[i]->name()); paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
auto encode_var_name = original_name + g_dgc_encoded; auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name); auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
auto &in = in_var->Get<LoDTensor>(); auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in); ins.emplace_back(&in);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
// Setter/getter pair for the fuse-parameter group size. Only the declarations
// are visible in this header.
// NOTE(review): presumably bounds how many parameters are placed in one fused
// group -- confirm against the definition.
void SetFuseParameterGroupsSize(int group_size);
int GetFuseParameterGroupsSize();
// Setter/getter pair for the fuse-parameter memory size.
// NOTE(review): the unit of memory_size is not stated in this header --
// confirm against the definition.
void SetFuseParameterMemorySize(uint64_t memory_size);
uint64_t GetFuseParameterMemorySize();
// Graph pass derived from ir::Pass. Only declarations are present in this
// header; every member function is const, so the pass object itself is not
// mutated while it runs.
// NOTE(review): judging by the name, the pass allocates one continuous buffer
// for gradients -- confirm against the implementation file.
class AllocContinuousSpaceForGradPass : public ir::Pass {
protected:
// Entry point invoked through the ir::Pass interface (overrides the base).
void ApplyImpl(ir::Graph *graph) const override;
// Templated on the attribute type stored under attr_name in the graph.
// NOTE(review): presumably erases and re-creates that attribute -- confirm.
template <typename AttrType>
void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const;
// Fills *group_grads_params from params_grads, using var_nodes to look up
// graph nodes by variable name.
// NOTE(review): presumably dispatches to the SetGroupAccordingTo* helpers
// below -- confirm.
void SetGroupGradsAndParams(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const ParamsAndGrads &params_grads,
GroupGradsAndParams *group_grads_params) const;
// Grouping helper that takes the flat params_grads list as input.
// NOTE(review): presumably groups by layer, as the name says -- confirm.
void SetGroupAccordingToLayers(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const ParamsAndGrads &params_grads,
GroupGradsAndParams *group_grads_params) const;
// Grouping helper that works in place on *group_grads_params.
// NOTE(review): presumably regroups by GetFuseParameterMemorySize() --
// confirm.
void SetGroupAccordingToMemorySize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
GroupGradsAndParams *group_grads_params) const;
// Grouping helper that works in place on *group_grads_params.
// NOTE(review): presumably regroups by GetFuseParameterGroupsSize() --
// confirm.
void SetGroupAccordingToGroupSize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
GroupGradsAndParams *group_grads_params) const;
private:
// Predicate on a variable type; the accepted types are defined in the
// implementation file, not here.
bool IsSupportedVarType(const proto::VarType::Type &type) const;
// Takes a graph node and an output list of parameter/gradient pairs.
// NOTE(review): presumably appends the pair found on `node` -- confirm.
void RecordParamsAndGrads(ir::Node *node, ParamsAndGrads *params_grads) const;
// Takes the places and scopes of all devices plus the fused variable name.
// NOTE(review): presumably creates fused_var_name in every scope and runs
// the allocation op -- confirm.
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::unordered_map<std::string, ir::Node *> &vars,
const std::string &fused_var_name,
const ParamsAndGrads &params_grads) const;
// Takes parameter names, gradient names, the fused variable name and the
// block that receives the new op.
// NOTE(review): presumably appends a single allocation op to global_block
// -- confirm.
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -101,8 +101,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -101,8 +101,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
"mode."; "mode.";
strategy_.fuse_all_optimizer_ops_ = false; strategy_.fuse_all_optimizer_ops_ = false;
} else { } else {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
// NOTE: fuse_all_xx_ops will count the number of xx operator first, // NOTE: fuse_all_xx_ops will count the number of xx operator first,
// if the number is zero, fuse_all_reduce_ops will do nothing. // if the number is zero, fuse_all_reduce_ops will do nothing.
// Currently, only one type of optimization algorithm can be fused. // Currently, only one type of optimization algorithm can be fused.
...@@ -142,6 +140,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -142,6 +140,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("memory_optimize_pass"); AppendPass("memory_optimize_pass");
} }
// runtime_context_cache pass should be the last pass so that it can enable
// the attr on all original and fused operators. However, it cannot enable the
// attr on any operator if it is placed after MultiDevPass.
if (strategy_.cache_runtime_context_) {
VLOG(10) << "Add runtime_context_cache_pass";
AppendPass("runtime_context_cache_pass");
}
if (strategy_.cache_expected_kernel_) {
VLOG(10) << "Add expected_kernel_cache_pass";
AppendPass("expected_kernel_cache_pass");
}
AppendMultiDevPass(strategy_); AppendMultiDevPass(strategy_);
if (strategy_.fuse_all_reduce_ops_) { if (strategy_.fuse_all_reduce_ops_) {
...@@ -243,7 +254,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, ...@@ -243,7 +254,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
CreatePassesFromStrategy(false); CreatePassesFromStrategy(false);
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) { for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
VLOG(3) << "apply " << pass->Type(); VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type();
if (IsMultiDevPass(pass->Type())) { if (IsMultiDevPass(pass->Type())) {
pass->Erase(kPlaces); pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places); pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
...@@ -328,3 +339,5 @@ USE_PASS(graph_to_program_pass); ...@@ -328,3 +339,5 @@ USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass); USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass); USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_all_reduce_op_pass); USE_PASS(fuse_all_reduce_op_pass);
USE_PASS(runtime_context_cache_pass);
USE_PASS(expected_kernel_cache_pass);
...@@ -83,11 +83,11 @@ struct BuildStrategy { ...@@ -83,11 +83,11 @@ struct BuildStrategy {
bool sync_batch_norm_{false}; bool sync_batch_norm_{false};
bool memory_optimize_{true}; // FIXME(liuwei1031) disable memory_optimize and enable_inplace in 1.4
// TODO(dzhwinter): // to open them by default, we need to solve the fetch variable issue
// make enable_inplace, memory_optimize_ bool memory_optimize_{false};
// memory_early_delete_ true by default
bool enable_inplace_{true}; bool enable_inplace_{false};
bool enable_sequential_execution_{false}; bool enable_sequential_execution_{false};
...@@ -107,6 +107,9 @@ struct BuildStrategy { ...@@ -107,6 +107,9 @@ struct BuildStrategy {
std::vector<std::string> trainers_endpoints_; std::vector<std::string> trainers_endpoints_;
bool remove_unnecessary_lock_{true}; bool remove_unnecessary_lock_{true};
bool cache_runtime_context_{false};
bool cache_expected_kernel_{true};
// NOTE: // NOTE:
// Before you add new options, think if it's a general strategy that works // Before you add new options, think if it's a general strategy that works
// with other strategy. If not, the strategy should be created through // with other strategy. If not, the strategy should be created through
......
...@@ -24,7 +24,7 @@ namespace details { ...@@ -24,7 +24,7 @@ namespace details {
const std::string FuseAdamOpPass::GetOpType() const { return "adam"; } const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }
const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const { const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
} }
void FuseAdamOpPass::FuseOptimizerOps( void FuseAdamOpPass::FuseOptimizerOps(
...@@ -77,16 +77,16 @@ void FuseAdamOpPass::FuseAdamOps( ...@@ -77,16 +77,16 @@ void FuseAdamOpPass::FuseAdamOps(
VLOG(10) << "Insert adam to graph "; VLOG(10) << "Insert adam to graph ";
OpDesc adam_desc(adam_ops[0]->Op()->Block()); OpDesc adam_desc(adam_ops[0]->Op()->Block());
adam_desc.SetType("adam"); adam_desc.SetType("adam");
adam_desc.SetInput("Param", {fused_vars_name.at("Param")}); adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate")); adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate));
adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
adam_desc.SetAttr("beta1", beta1); adam_desc.SetAttr("beta1", beta1);
......
...@@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes); auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
const std::string fuse_op_type = GetOpType(); const std::string fuse_op_type = GetOpType();
const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames(); std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
aux_var_names.emplace_back(kParam);
aux_var_names.emplace_back(kGrad);
// Step 1: Get the specified op and auxiliary variables. // Step 1: Get the specified op and auxiliary variables.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result); std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
...@@ -61,7 +63,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -61,7 +63,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
result.Set(kFusedVars, new FusedVars); result.Set(kFusedVars, new FusedVars);
} }
std::unordered_map<std::string, std::string> fused_vars_name; std::unordered_map<std::string, std::string> fused_vars_name;
fused_vars_name.reserve(aux_var_names.size() + 1); fused_vars_name.reserve(aux_var_names.size());
auto &fused_var_set = result.Get<FusedVars>(kFusedVars); auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
const std::string prefix(kFusedVarNamePrefix); const std::string prefix(kFusedVarNamePrefix);
// NOTE: the fused_var_name should be unique. // NOTE: the fused_var_name should be unique.
...@@ -75,39 +77,103 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -75,39 +77,103 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
} }
// Step 3: Get the fused Gradient's name // Step 3: Get the fused Gradient's name
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads); bool grad_fused = false;
if (!result.Has(kFusedGrads)) { if (result.Has(kParamsAndGrads)) {
PADDLE_THROW( auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
"The alloc_continuous_space_for_grad_pass should be called before this " PADDLE_ENFORCE_EQ(
"pass."); params_grads.size(), aux_var_set.at(kGrad).size(),
} "The number of gradients and optimizer ops is not equal.");
auto &fused_grad = result.Get<FusedGrads>(kFusedGrads); std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).begin(),
auto &fused_vars = result.Get<FusedVars>(kFusedVars); aux_var_set.at(kGrad).end());
auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); size_t same_grad_num = 0;
PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); for (auto &p_g : params_grads) {
fused_vars_name.emplace("Grad", fused_grad); if (opt_grad_set.count(p_g.second)) {
++same_grad_num;
// Step 4: Sort the parameters and auxiliary variables according }
// to parameters' name to make variables' name correspond correctly. }
PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(), // NOTE(zcd): the gradient of kParamsAndGrads may be different with the
"The size of params_grads and aux_var_set are not equal."); // kGrad.
SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); if (same_grad_num == aux_var_set.at(kGrad).size()) {
if (!result.Has(kFusedGrads)) {
// Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g. PADDLE_THROW(
"The alloc_continuous_space_for_grad_pass should be called before "
"this pass.");
}
auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
auto &fused_vars = result.Get<FusedVars>(kFusedVars);
auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
fused_vars_name[kGrad] = fused_grad;
// Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly.
SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
grad_fused = true;
}
}
// Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately. // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
aux_var_names.pop_back();
if (!grad_fused) {
InitFusedGradsAndAllocSpaceForGrads(
places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad),
fused_vars_name.at(kGrad), &result);
}
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names, InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
aux_var_set, fused_vars_name); aux_var_set, fused_vars_name);
// Step 6: Fuse optimizer Ops and Scale Ops // Step 5: Fuse optimizer Ops and Scale Ops
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result); FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
// Step 7: Remove optimizer Ops // Step 6: Remove optimizer Ops
for (auto &opt_op : opt_ops) { for (auto &opt_op : opt_ops) {
graph->RemoveNode(opt_op); graph->RemoveNode(opt_op);
} }
} }
// Creates the fused gradient variable and every individual gradient variable
// in each local scope, then builds one alloc_continuous_space op and runs it
// on every (scope, place) pair.
//
// places          - device places; places[i] is paired with local_scopes[i]
//                   when the init op is run.
// local_scopes    - scopes that receive the fused and individual gradients.
// params          - parameter names, passed as the op's "Input".
// grads           - gradient names, passed as the op's "Output"; each one must
//                   name a LOD_TENSOR variable node of `result`.
// fused_grad_name - name of the fused gradient ("FusedOutput"); it must not be
//                   visible from any of the local scopes yet.
// result          - graph whose variable nodes are used to validate `grads`.
//
// Any violated precondition is reported through the PADDLE_ENFORCE* macros.
void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
    const std::vector<platform::Place> &places,
    const std::vector<Scope *> &local_scopes,
    const std::vector<std::string> &params,
    const std::vector<std::string> &grads, const std::string &fused_grad_name,
    ir::Graph *result) const {
  // Get Var Nodes
  std::unordered_map<std::string, ir::Node *> vars;
  for (ir::Node *node : result->Nodes()) {
    if (node->IsVar() && node->Var()) {
      // Note: The graph may have the same name node. For example, parameter
      // is the input of operator and it also is the output of optimizer;
      // emplace() keeps the first node seen for a given name.
      vars.emplace(node->Var()->Name(), node);
    }
  }

  // Init Grads
  // Scopes are visited from last to first, exactly as in InitVars.
  for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
    auto &scope = *it;
    VLOG(10) << "Init " << fused_grad_name;
    PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
                   "%s has existed in scope.", fused_grad_name);
    scope->Var(fused_grad_name)->GetMutable<LoDTensor>();

    for (auto &grad_var_name : grads) {
      auto iter = vars.find(grad_var_name);
      // Name the offending gradient in every failure so the error is
      // actionable without a debugger.
      PADDLE_ENFORCE(iter != vars.end(), "%s is not found in the graph.",
                     grad_var_name);
      PADDLE_ENFORCE_NOT_NULL(iter->second->Var(),
                              "The Var() of node %s should not be null.",
                              grad_var_name);
      PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
                        proto::VarType::LOD_TENSOR,
                        "The type of %s should be LOD_TENSOR.", grad_var_name);
      scope->Var(grad_var_name)->GetMutable<LoDTensor>();
    }
  }

  // Define Ops
  ProgramDesc program_desc;
  auto *global_block = program_desc.MutableBlock(0);
  // copy_data = false, check_name = false.
  AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
                             false, false);

  // Run Ops
  RunInitOps(places, local_scopes, *global_block);
}
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
...@@ -115,37 +181,49 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( ...@@ -115,37 +181,49 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::unordered_map<std::string, std::vector<std::string>> const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set, &aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const { const std::unordered_map<std::string, std::string> &fused_vars_name) const {
VLOG(10) << "Init FusedVars."; // Init Vars
// Alloc parameters and auxiliary vars in the respective scope. for (auto &var_name : aux_var_names) {
size_t idx = local_scopes.size(); auto &fused_var_name = fused_vars_name.at(var_name);
for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); InitVars(local_scopes, fused_var_name);
++iter, --idx) {
auto &scope = *iter;
for (auto &var_name : aux_var_names) {
auto fused_var_name = fused_vars_name.at(var_name);
VLOG(10) << "Init " << fused_var_name;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has exist in scope[%d]", fused_var_name, idx);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
}
} }
// Define Ops
ProgramDesc program_desc; ProgramDesc program_desc;
auto *global_block = program_desc.MutableBlock(0); auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) { for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace(aux_var_set.at(var_name), AppendAllocContinuousSpace(
fused_vars_name.at(var_name), true, aux_var_set.at(var_name), aux_var_set.at(var_name),
global_block); fused_vars_name.at(var_name), global_block, true);
} }
// Run Ops
RunInitOps(places, local_scopes, *global_block);
}
void FuseOptimizerOpPass::RunInitOps(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const BlockDesc &global_block) const {
for (size_t i = 0; i < local_scopes.size(); ++i) { for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : global_block->AllOps()) { for (auto &op_desc : global_block.AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc); auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]); op->Run(*local_scopes[i], places[i]);
} }
} }
} }
// Creates the LoDTensor variable `fused_var_name` in every scope of
// `local_scopes`, visiting the scopes from last to first. Enforces that the
// name cannot already be found from a scope before creating it there.
void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
                                   const std::string &fused_var_name) const {
  VLOG(10) << "Init FusedVars.";
  // scope_no is the 1-based position of the scope being initialised; it is
  // only used in the error message.
  for (size_t scope_no = local_scopes.size(); scope_no > 0; --scope_no) {
    Scope *current = local_scopes[scope_no - 1];
    VLOG(10) << "Init " << fused_var_name;
    PADDLE_ENFORCE(current->FindVar(fused_var_name) == nullptr,
                   "%s has exist in scope[%d]", fused_var_name, scope_no);
    current->Var(fused_var_name)->GetMutable<LoDTensor>();
  }
}
void FuseOptimizerOpPass::SortParametersAndAuxVars( void FuseOptimizerOpPass::SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads, const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set, std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
...@@ -203,15 +281,16 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( ...@@ -203,15 +281,16 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
} }
void FuseOptimizerOpPass::AppendAllocContinuousSpace( void FuseOptimizerOpPass::AppendAllocContinuousSpace(
const std::vector<std::string> &args, const std::string &out_arg, const std::vector<std::string> &in_args,
bool copy_data, BlockDesc *global_block) const { const std::vector<std::string> &out_args, const std::string &fused_out_arg,
BlockDesc *global_block, bool copy_data, bool check_name) const {
auto op_desc = global_block->AppendOp(); auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space"); op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", args); op_desc->SetInput("Input", in_args);
op_desc->SetOutput("Output", args); op_desc->SetOutput("Output", out_args);
op_desc->SetOutput("FusedOutput", {out_arg}); op_desc->SetOutput("FusedOutput", {fused_out_arg});
op_desc->SetAttr("copy_data", copy_data); op_desc->SetAttr("copy_data", copy_data);
op_desc->SetAttr("check_name", true); op_desc->SetAttr("check_name", check_name);
} }
void FuseOptimizerOpPass::InserInputAndOutputForOptOps( void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
......
...@@ -27,6 +27,10 @@ namespace paddle { ...@@ -27,6 +27,10 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Names shared by the optimizer-fusing passes. They are used both as operator
// input names (e.g. SetInput(kParam, ...)) and as keys into the
// fused_vars_name / aux_var_set maps.
constexpr char kGrad[] = "Grad";
constexpr char kParam[] = "Param";
constexpr char kLearningRate[] = "LearningRate";
class FuseOptimizerOpPass : public ir::Pass { class FuseOptimizerOpPass : public ir::Pass {
protected: protected:
void ApplyImpl(ir::Graph *graph) const override; void ApplyImpl(ir::Graph *graph) const override;
...@@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass { ...@@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass {
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name) std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const; const;
void AppendAllocContinuousSpace(const std::vector<std::string> &args, void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
const std::string &out_arg, bool copy_data, const std::vector<std::string> &out_args,
BlockDesc *global_block) const; const std::string &fused_out_arg,
BlockDesc *global_block, bool copy_data,
bool check_name = true) const;
// Creates `fused_grad_name` and every variable named in `grads` in each of
// `local_scopes`, then appends one alloc_continuous_space op (params as
// input, grads as output, copy_data and check_name both false) and runs it.
// `result` supplies the variable nodes used to validate `grads`.
void InitFusedGradsAndAllocSpaceForGrads(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name,
ir::Graph *result) const;
void InitFusedVarsAndAllocSpaceForVars( void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
...@@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass { ...@@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass {
&aux_var_set, &aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const std::unordered_map<std::string, std::string> &fused_vars_name)
const; const;
// For every index i of `local_scopes`, creates each op of `global_block` and
// runs it on local_scopes[i] with places[i].
void RunInitOps(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const BlockDesc &global_block) const;
// Creates the LoDTensor variable `fused_var_name` in every scope of
// `local_scopes`; enforces that the name is not already found from a scope.
void InitVars(const std::vector<Scope *> &local_scopes,
const std::string &fused_var_name) const;
}; };
} // namespace details } // namespace details
......
...@@ -24,7 +24,7 @@ namespace details { ...@@ -24,7 +24,7 @@ namespace details {
const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; } const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }
const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const { const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
return {"Param"}; return {};
} }
void FuseSgdOpPass::FuseOptimizerOps( void FuseSgdOpPass::FuseOptimizerOps(
...@@ -50,12 +50,12 @@ void FuseSgdOpPass::FuseSgdOps( ...@@ -50,12 +50,12 @@ void FuseSgdOpPass::FuseSgdOps(
// Add fused scale // Add fused scale
OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
Sgd_desc.SetType("sgd"); Sgd_desc.SetType("sgd");
Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")}); Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate")); Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate));
// NOTE: multi_devices_pass requires that every op should have a role. // NOTE: multi_devices_pass requires that every op should have a role.
Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
......
...@@ -305,6 +305,12 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op, ...@@ -305,6 +305,12 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name; VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
if (var_nodes_[in_var_name].back() != in_node) {
VLOG(4) << "SKIP since " << in_var_name
<< " is also used as output by other ops";
continue;
}
bool can_replace = true; bool can_replace = true;
if (in_var_name == out_var_name) { if (in_var_name == out_var_name) {
can_replace = false; can_replace = false;
...@@ -527,6 +533,9 @@ void GraphView::Build(ir::Graph* g) { ...@@ -527,6 +533,9 @@ void GraphView::Build(ir::Graph* g) {
}; };
for (auto& node : g->Nodes()) { for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue; if (!node->IsOp()) continue;
// avoid optimize the variable used in sub-blocks
if (OpHasSubBlock(node->Op())) update_skip_set(node);
if (node->Name() == "send") update_skip_set(node); if (node->Name() == "send") update_skip_set(node);
if (node->Name() == "recv") update_skip_set(node); if (node->Name() == "recv") update_skip_set(node);
if (node->Name() == "prefetch") update_skip_set(node); if (node->Name() == "prefetch") update_skip_set(node);
......
...@@ -233,6 +233,12 @@ struct OpInfoFiller<T, kNoNeedBufferVarsInference> { ...@@ -233,6 +233,12 @@ struct OpInfoFiller<T, kNoNeedBufferVarsInference> {
} }
}; };
// Placeholder OpInfoFiller specialization for `void`: calling it does nothing
// and leaves `info` untouched.
template <>
struct OpInfoFiller<void, kUnknown> {
  void operator()(const char* /*op_type*/, OpInfo* /*info*/) const {
    // Intentionally empty.
  }
};
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -106,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ...@@ -106,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
VLOG(1) << "set num_threads: " << strategy_.num_threads_ VLOG(1) << "set num_threads: " << strategy_.num_threads_
<< " to run the operators of the graph on each device."; << " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) { for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor( executors_.emplace_back(new details::FastThreadedSSAGraphExecutor(
strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get())); strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
} }
} }
......
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "ThreadPool.h" #include "ThreadPool.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
namespace paddle { namespace paddle {
...@@ -48,7 +48,8 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ...@@ -48,7 +48,8 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
std::vector<std::unique_ptr<ir::Graph>> graphs_; std::vector<std::unique_ptr<ir::Graph>> graphs_;
std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_; std::vector<std::unique_ptr<details::FastThreadedSSAGraphExecutor>>
executors_;
ExceptionHolder exception_holder_; ExceptionHolder exception_holder_;
}; };
......
...@@ -21,40 +21,40 @@ namespace framework { ...@@ -21,40 +21,40 @@ namespace framework {
void DownpourWorker::Initialize(const TrainerDesc& desc) { void DownpourWorker::Initialize(const TrainerDesc& desc) {
param_ = desc.downpour_param(); param_ = desc.downpour_param();
for (size_t i = 0; i < param_.sparse_table_size(); ++i) { for (int i = 0; i < param_.sparse_table_size(); ++i) {
uint64_t table_id = uint64_t table_id =
static_cast<uint64_t>(param_.sparse_table(i).table_id()); static_cast<uint64_t>(param_.sparse_table(i).table_id());
TableParameter table = param_.sparse_table(i); TableParameter table = param_.sparse_table(i);
sparse_key_names_[table_id].resize(table.sparse_key_name_size()); sparse_key_names_[table_id].resize(table.sparse_key_name_size());
for (size_t j = 0; j < table.sparse_key_name_size(); ++j) { for (int j = 0; j < table.sparse_key_name_size(); ++j) {
sparse_key_names_[table_id][j] = table.sparse_key_name(j); sparse_key_names_[table_id][j] = table.sparse_key_name(j);
} }
sparse_value_names_[table_id].resize(table.sparse_value_name_size()); sparse_value_names_[table_id].resize(table.sparse_value_name_size());
for (size_t j = 0; j < table.sparse_value_name_size(); ++j) { for (int j = 0; j < table.sparse_value_name_size(); ++j) {
sparse_value_names_[table_id][j] = table.sparse_value_name(j); sparse_value_names_[table_id][j] = table.sparse_value_name(j);
} }
sparse_grad_names_[table_id].resize(table.sparse_grad_name_size()); sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
for (size_t j = 0; j < table.sparse_grad_name_size(); ++j) { for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
sparse_grad_names_[table_id][j] = table.sparse_grad_name(j); sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
} }
label_var_name_[table_id] = table.label_var_name(); label_var_name_[table_id] = table.label_var_name();
} }
for (size_t i = 0; i < param_.dense_table_size(); ++i) { for (int i = 0; i < param_.dense_table_size(); ++i) {
uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id()); uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
auto table = param_.dense_table(i); auto table = param_.dense_table(i);
dense_value_names_[table_id].resize(table.dense_value_name_size()); dense_value_names_[table_id].resize(table.dense_value_name_size());
for (size_t j = 0; j < table.dense_value_name_size(); ++j) { for (int j = 0; j < table.dense_value_name_size(); ++j) {
dense_value_names_[table_id][j] = table.dense_value_name(j); dense_value_names_[table_id][j] = table.dense_value_name(j);
} }
dense_grad_names_[table_id].resize(table.dense_grad_name_size()); dense_grad_names_[table_id].resize(table.dense_grad_name_size());
for (size_t j = 0; j < table.dense_grad_name_size(); ++j) { for (int j = 0; j < table.dense_grad_name_size(); ++j) {
dense_grad_names_[table_id][j] = table.dense_grad_name(j); dense_grad_names_[table_id][j] = table.dense_grad_name(j);
} }
} }
skip_ops_.resize(param_.skip_ops_size()); skip_ops_.resize(param_.skip_ops_size());
for (size_t i = 0; i < param_.skip_ops_size(); ++i) { for (int i = 0; i < param_.skip_ops_size(); ++i) {
skip_ops_[i] = param_.skip_ops(i); skip_ops_[i] = param_.skip_ops(i);
} }
...@@ -83,14 +83,14 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) { ...@@ -83,14 +83,14 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) {
LoDTensor* tensor = var->GetMutable<LoDTensor>(); LoDTensor* tensor = var->GetMutable<LoDTensor>();
int64_t* label_ptr = tensor->data<int64_t>(); int64_t* label_ptr = tensor->data<int64_t>();
int global_index = 0; size_t global_index = 0;
for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
VLOG(3) << "sparse_key_names_[" << i VLOG(3) << "sparse_key_names_[" << i
<< "]: " << sparse_key_names_[table_id][i]; << "]: " << sparse_key_names_[table_id][i];
Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]); Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]);
LoDTensor* tensor = fea_var->GetMutable<LoDTensor>(); LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
int64_t* ids = tensor->data<int64_t>(); int64_t* ids = tensor->data<int64_t>();
int fea_idx = 0; size_t fea_idx = 0;
// tensor->lod()[0].size() == batch_size + 1 // tensor->lod()[0].size() == batch_size + 1
for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) { for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) { for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
...@@ -138,7 +138,7 @@ void DownpourWorker::FillSparseValue(size_t table_idx) { ...@@ -138,7 +138,7 @@ void DownpourWorker::FillSparseValue(size_t table_idx) {
auto& tensor_lod = tensor->lod()[0]; auto& tensor_lod = tensor->lod()[0];
LoD data_lod{tensor_lod}; LoD data_lod{tensor_lod};
tensor_emb->set_lod(data_lod); tensor_emb->set_lod(data_lod);
for (auto index = 0u; index < len; ++index) { for (int index = 0; index < len; ++index) {
if (ids[index] == 0u) { if (ids[index] == 0u) {
memcpy(ptr + table.emb_dim() * index, init_value.data() + 2, memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
sizeof(float) * table.emb_dim()); sizeof(float) * table.emb_dim());
...@@ -192,7 +192,7 @@ void DownpourWorker::TrainFilesWithProfiler() { ...@@ -192,7 +192,7 @@ void DownpourWorker::TrainFilesWithProfiler() {
read_time += timeline.ElapsedSec(); read_time += timeline.ElapsedSec();
total_time += timeline.ElapsedSec(); total_time += timeline.ElapsedSec();
VLOG(3) << "program config size: " << param_.program_config_size(); VLOG(3) << "program config size: " << param_.program_config_size();
for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
++i) { ++i) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).pull_sparse_table_id(i)); param_.program_config(0).pull_sparse_table_id(i));
...@@ -244,8 +244,8 @@ void DownpourWorker::TrainFilesWithProfiler() { ...@@ -244,8 +244,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
} }
if (need_to_push_sparse_) { if (need_to_push_sparse_) {
for (size_t i = 0; for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
i < param_.program_config(0).push_sparse_table_id_size(); ++i) { ++i) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).push_sparse_table_id(i)); param_.program_config(0).push_sparse_table_id(i));
TableParameter table; TableParameter table;
...@@ -268,8 +268,8 @@ void DownpourWorker::TrainFilesWithProfiler() { ...@@ -268,8 +268,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
if (need_to_push_dense_) { if (need_to_push_dense_) {
timeline.Start(); timeline.Start();
for (size_t i = 0; for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
i < param_.program_config(0).push_dense_table_id_size(); ++i) { ++i) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).push_dense_table_id(i)); param_.program_config(0).push_dense_table_id(i));
fleet_ptr_->PushDenseVarsAsync( fleet_ptr_->PushDenseVarsAsync(
...@@ -315,8 +315,8 @@ void DownpourWorker::TrainFilesWithProfiler() { ...@@ -315,8 +315,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
} }
if (need_to_push_dense_) { if (need_to_push_dense_) {
for (size_t i = 0; for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
i < param_.program_config(0).push_dense_table_id_size(); ++i) { ++i) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).push_dense_table_id(i)); param_.program_config(0).push_dense_table_id(i));
pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
...@@ -362,7 +362,7 @@ void DownpourWorker::TrainFiles() { ...@@ -362,7 +362,7 @@ void DownpourWorker::TrainFiles() {
int cur_batch; int cur_batch;
while ((cur_batch = device_reader_->Next()) > 0) { while ((cur_batch = device_reader_->Next()) > 0) {
// pull sparse here // pull sparse here
for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
++i) { ++i) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).pull_sparse_table_id(i)); param_.program_config(0).pull_sparse_table_id(i));
...@@ -397,8 +397,8 @@ void DownpourWorker::TrainFiles() { ...@@ -397,8 +397,8 @@ void DownpourWorker::TrainFiles() {
if (need_to_push_sparse_) { if (need_to_push_sparse_) {
// push gradients here // push gradients here
for (size_t i = 0; for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
i < param_.program_config(0).push_sparse_table_id_size(); ++i) { ++i) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).push_sparse_table_id(i)); param_.program_config(0).push_sparse_table_id(i));
TableParameter table; TableParameter table;
...@@ -416,8 +416,8 @@ void DownpourWorker::TrainFiles() { ...@@ -416,8 +416,8 @@ void DownpourWorker::TrainFiles() {
} }
if (need_to_push_dense_) { if (need_to_push_dense_) {
for (size_t i = 0; for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
i < param_.program_config(0).push_dense_table_id_size(); ++i) { ++i) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).push_dense_table_id(i)); param_.program_config(0).push_dense_table_id(i));
fleet_ptr_->PushDenseVarsAsync( fleet_ptr_->PushDenseVarsAsync(
...@@ -461,8 +461,8 @@ void DownpourWorker::TrainFiles() { ...@@ -461,8 +461,8 @@ void DownpourWorker::TrainFiles() {
} }
if (need_to_push_dense_) { if (need_to_push_dense_) {
for (size_t i = 0; for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
i < param_.program_config(0).push_dense_table_id_size(); ++i) { ++i) {
uint64_t tid = static_cast<uint64_t>( uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).push_dense_table_id(i)); param_.program_config(0).push_dense_table_id(i));
pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
......
...@@ -68,6 +68,7 @@ pass_library(transpose_flatten_concat_fuse_pass inference) ...@@ -68,6 +68,7 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base) pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base) pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base) pass_library(runtime_context_cache_pass base)
pass_library(expected_kernel_cache_pass base)
pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(quant_conv2d_dequant_fuse_pass inference)
pass_library(fillconstant_elementwisemul_fuse inference) pass_library(fillconstant_elementwisemul_fuse inference)
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/expected_kernel_cache_pass.h"
#include <memory>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace framework {
namespace ir {
// Walks every node of `graph` and, for each operator node whose Op() is
// non-null, sets the kEnableCacheExpectedKernel attribute to true.
void ExpectedKernelCachePass::ApplyImpl(ir::Graph* graph) const {
  VLOG(3) << "Applies Expected Kernel Cache strategy.";
  for (const Node* node : graph->Nodes()) {
    // Only operator nodes are of interest.
    if (!node->IsOp()) continue;
    // Operator nodes without a backing Op() are left untouched.
    auto* op_desc = node->Op();
    if (!op_desc) continue;
    op_desc->SetAttr(kEnableCacheExpectedKernel, true);
  }
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Register ExpectedKernelCachePass under the name expected_kernel_cache_pass.
REGISTER_PASS(expected_kernel_cache_pass,
paddle::framework::ir::ExpectedKernelCachePass);
...@@ -12,23 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,23 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_WBAES #pragma once
#include "paddle/fluid/platform/dynload/wbaes.h" #include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle { namespace paddle {
namespace platform { namespace framework {
namespace dynload { namespace ir {
std::once_flag wbaes_dso_flag; class ExpectedKernelCachePass : public Pass {
void *wbaes_dso_handle = nullptr; protected:
void ApplyImpl(ir::Graph* graph) const override;
};
#define DEFINE_WRAP(__name) DynLoad__##__name __name } // namespace ir
} // namespace framework
WBAES_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle } // namespace paddle
#endif
...@@ -31,10 +31,10 @@ namespace paddle { ...@@ -31,10 +31,10 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
namespace { namespace {
void SortHelper( void SortHelper(const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>,
const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list, ir::NodeComp> &adj_list,
ir::Node *node, std::unordered_set<ir::Node *> *visited, ir::Node *node, std::unordered_set<ir::Node *> *visited,
std::vector<ir::Node *> *ret) { std::vector<ir::Node *> *ret) {
visited->insert(node); visited->insert(node);
for (auto adj : adj_list.at(node)) { for (auto adj : adj_list.at(node)) {
...@@ -50,7 +50,8 @@ void SortHelper( ...@@ -50,7 +50,8 @@ void SortHelper(
bool HasCircleHelper( bool HasCircleHelper(
ir::Node *node, ir::Node *node,
const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list, const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
&adj_list,
std::unordered_set<ir::Node *> *visited, std::unordered_set<ir::Node *> *visited,
std::unordered_set<ir::Node *> *in_trace, std::unordered_set<ir::Node *> *in_trace,
std::vector<std::vector<ir::Node *>> *circles) { std::vector<std::vector<ir::Node *>> *circles) {
...@@ -84,7 +85,8 @@ bool HasCircleHelper( ...@@ -84,7 +85,8 @@ bool HasCircleHelper(
} }
bool HasCircleInternal( bool HasCircleInternal(
const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list, const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
&adj_list,
std::vector<std::vector<ir::Node *>> *circles) { std::vector<std::vector<ir::Node *>> *circles) {
std::unordered_set<ir::Node *> visited; std::unordered_set<ir::Node *> visited;
std::unordered_set<ir::Node *> in_trace; std::unordered_set<ir::Node *> in_trace;
...@@ -107,8 +109,8 @@ bool FindCircleSubGraph(const Graph &graph, ...@@ -107,8 +109,8 @@ bool FindCircleSubGraph(const Graph &graph,
} }
std::vector<ir::Node *> TopologySortOperations(const Graph &graph) { std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list = std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
BuildOperationAdjList(graph); adj_list = BuildOperationAdjList(graph);
PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr)); PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr));
std::unordered_set<ir::Node *> visited; std::unordered_set<ir::Node *> visited;
std::vector<ir::Node *> ret; std::vector<ir::Node *> ret;
...@@ -117,34 +119,30 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) { ...@@ -117,34 +119,30 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
SortHelper(adj_list, adj.first, &visited, &ret); SortHelper(adj_list, adj.first, &visited, &ret);
} }
} }
return ret; return ret;
} }
// Build operator inlink edge table. // Build operator inlink edge table.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList( std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
const Graph &graph) { BuildOperationAdjList(const Graph &graph) {
std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list; std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
adj_list;
for (auto &n : graph.Nodes()) { for (auto &n : graph.Nodes()) {
if (!n->IsOp()) continue; if (!n->IsOp()) continue;
if (adj_list.find(n) == adj_list.end()) { if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>(); adj_list[n] = std::set<ir::Node *, ir::NodeComp>();
} }
std::vector<ir::Node *> nodes;
for (auto &var : n->inputs) { for (auto &var : n->inputs) {
for (auto &adj_n : var->inputs) { for (auto &adj_n : var->inputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n) VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n) << " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var); << " via " << var->Name() << reinterpret_cast<void *>(var);
nodes.push_back(adj_n); adj_list[n].insert(adj_n);
} }
} }
std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
return node1->id() > node2->id();
});
adj_list[n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
} }
return adj_list; return adj_list;
} }
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <map> #include <map>
#include <memory> #include <memory>
#include <set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -25,6 +26,13 @@ namespace paddle { ...@@ -25,6 +26,13 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
// Compare nodes via node id.
struct NodeComp {
bool operator()(ir::Node *const &node1, ir::Node *const &node2) const {
return node1->id() < node2->id();
}
};
// Test if the graph contains circle. // Test if the graph contains circle.
bool HasCircle(const Graph &graph); bool HasCircle(const Graph &graph);
...@@ -57,8 +65,8 @@ std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind); ...@@ -57,8 +65,8 @@ std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
void CleanIndividualNodes(Graph *graph); void CleanIndividualNodes(Graph *graph);
// Build an adjacency list of operations for the `graph`. // Build an adjacency list of operations for the `graph`.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList( std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
const Graph &graph); BuildOperationAdjList(const Graph &graph);
template <typename T> template <typename T>
std::vector<T *> FilterByNodeWrapper(const Graph &graph) { std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
......
...@@ -23,7 +23,7 @@ namespace ir { ...@@ -23,7 +23,7 @@ namespace ir {
void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const { void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "Applies Runtime Context Cache strategy."; VLOG(3) << "Applies Runtime Context Cache strategy.";
for (const Node* n : graph->Nodes()) { for (const Node* n : graph->Nodes()) {
if (n->IsOp()) { if (n->IsOp() && n->Op()) {
n->Op()->SetAttr(kEnableCacheRuntimeContext, true); n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
} }
} }
......
...@@ -241,6 +241,7 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, ...@@ -241,6 +241,7 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
outputs_ = outputs; outputs_ = outputs;
attrs_ = attrs; attrs_ = attrs;
need_update_ = true; need_update_ = true;
block_ = nullptr;
} }
OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) { OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) {
......
...@@ -880,7 +880,16 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig( ...@@ -880,7 +880,16 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
void OperatorWithKernel::RunImpl(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
if (!HasAttr(kEnableCacheRuntimeContext)) { // To reduce the elapsed time of HasAttr, we use bool variable to record the
// result of HasAttr.
if (!enable_cache_runtime_context && HasAttr(kEnableCacheRuntimeContext))
enable_cache_runtime_context = true;
if (!enable_cache_expected_kernel && HasAttr(kEnableCacheExpectedKernel))
enable_cache_expected_kernel = true;
if (!all_kernels_must_compute_runtime_shape &&
HasAttr(kAllKernelsMustComputeRuntimeShape))
all_kernels_must_compute_runtime_shape = true;
if (!enable_cache_runtime_context) {
RuntimeContext ctx(Inputs(), Outputs(), scope); RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx); RunImpl(scope, place, &ctx);
} else { } else {
...@@ -899,60 +908,33 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -899,60 +908,33 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place); auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered. if (!enable_cache_expected_kernel || !kernel_type_) {
auto& all_op_kernels = AllOpKernels(); ChooseKernel(*runtime_ctx, scope, place);
auto kernels_iter = all_op_kernels.find(type_);
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.", type_);
} }
OpKernelMap& kernels = kernels_iter->second; std::vector<KernelConfig>* kernel_configs = GetKernelConfig(*kernel_type_);
auto expected_kernel_key = this->GetExpectedKernelType(
ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
expected_kernel_key.library_type_ = LibraryType::kPlain;
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", type_,
KernelTypeToString(expected_kernel_key));
}
std::vector<KernelConfig>* kernel_configs =
GetKernelConfig(expected_kernel_key);
// do data transformScope &transfer_scope; // do data transformScope &transfer_scope;
std::vector<std::string> transfered_inplace_vars; std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope = PrepareData(scope, expected_kernel_key, auto* transfer_scope =
&transfered_inplace_vars, runtime_ctx); PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
// exec scope is the scope that kernel actually executed on. // exec scope is the scope that kernel actually executed on.
const Scope& exec_scope = const Scope& exec_scope =
(transfer_scope == nullptr ? scope : *transfer_scope); (transfer_scope == nullptr ? scope : *transfer_scope);
if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { if (!(kernel_type_->place_ == dev_ctx->GetPlace())) {
dev_ctx = pool.Get(expected_kernel_key.place_); dev_ctx = pool.Get(kernel_type_->place_);
} }
if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) { if (!all_kernels_must_compute_runtime_shape) {
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx); RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
} }
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs. // not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
*runtime_ctx, kernel_configs)); kernel_configs));
if (!transfered_inplace_vars.empty()) { if (!transfered_inplace_vars.empty()) {
// there is inplace variable has been transfered. // there is inplace variable has been transfered.
...@@ -978,6 +960,46 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -978,6 +960,46 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
} }
} }
// Picks the kernel this operator will run with and stores the outcome in
// kernel_type_ (the chosen OpKernelType) and kernel_func_ (the kernel functor).
//
// ctx:   runtime context forwarded to the ExecutionContext used for the query.
// scope: scope forwarded to that same ExecutionContext.
// place: place whose device context is fetched from the DeviceContextPool.
//
// Raises via PADDLE_THROW when no kernels are registered for type_, or when
// none of the registered kernels matches the expected kernel type.
void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
                                      const Scope& scope,
                                      const platform::Place& place) const {
  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);

  // check if op[type] has kernel registered.
  auto& kernel_registry = AllOpKernels();
  auto op_entry = kernel_registry.find(type_);
  if (op_entry == kernel_registry.end()) {
    PADDLE_THROW(
        "There are no kernels which are registered in the %s operator.", type_);
  }
  OpKernelMap& candidates = op_entry->second;

  // Ask the operator which kernel type it expects for this context.
  auto wanted_key = this->GetExpectedKernelType(
      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
  VLOG(3) << "expected_kernel_key:" << wanted_key;

  auto match = candidates.find(wanted_key);
#ifdef PADDLE_WITH_MKLDNN
  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
  if (match == candidates.end()) {
    if (wanted_key.library_type_ == LibraryType::kMKLDNN) {
      VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
      wanted_key.library_type_ = LibraryType::kPlain;
      wanted_key.data_layout_ = DataLayout::kAnyLayout;
      match = candidates.find(wanted_key);
    }
  }
#endif
  if (match == candidates.end()) {
    PADDLE_THROW("op %s does not have kernel for %s", type_,
                 KernelTypeToString(wanted_key));
  }

  // Remember the (possibly fallback-adjusted) key and its kernel functor.
  kernel_type_.reset(new OpKernelType(wanted_key));
  kernel_func_.reset(new OpKernelFunc(match->second));
}
void OperatorWithKernel::TransferInplaceVarsBack( void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars, const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const { const Scope& transfer_scope) const {
......
...@@ -70,6 +70,12 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; ...@@ -70,6 +70,12 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
/// this Op's execution to save the elapsed time. /// this Op's execution to save the elapsed time.
constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@"; constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
/// If an Op has attribute kEnableCacheExpectedKernel, it means that in a same
/// name scope and same place, since the expected kernel of this Op does not
/// change in the execution, it could be recorded only at the first iteration of
/// this Op's execution to save the elapsed time.
constexpr char kEnableCacheExpectedKernel[] = "@ENABLE_CACHE_EXPECTED_KERNEL@";
/// If an Op has this attribute, all its kernels should calculate output /// If an Op has this attribute, all its kernels should calculate output
/// variable's shape in the corresponding Compute() function. And /// variable's shape in the corresponding Compute() function. And
/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape() /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
...@@ -491,10 +497,18 @@ class OperatorWithKernel : public OperatorBase { ...@@ -491,10 +497,18 @@ class OperatorWithKernel : public OperatorBase {
const std::vector<std::string>& inplace_vars, const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const; const Scope& exec_scope) const;
void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
const platform::Place& place) const;
protected: protected:
mutable OpKernelConfigsMap kernel_configs_map_; mutable OpKernelConfigsMap kernel_configs_map_;
mutable std::unique_ptr<OpKernelType> kernel_type_;
mutable std::unique_ptr<OpKernelFunc> kernel_func_;
mutable std::unique_ptr<RuntimeContext> runtime_ctx_; mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
mutable const Scope* pre_scope_ = nullptr; mutable const Scope* pre_scope_ = nullptr;
mutable bool enable_cache_runtime_context = false;
mutable bool enable_cache_expected_kernel = false;
mutable bool all_kernels_must_compute_runtime_shape = false;
}; };
extern bool OpSupportGPU(const std::string& op_type); extern bool OpSupportGPU(const std::string& op_type);
......
...@@ -221,7 +221,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -221,7 +221,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
PADDLE_ENFORCE(!member_->use_cuda_, PADDLE_ENFORCE(!member_->use_cuda_,
"gpu mode does not support async_mode_ now!"); "gpu mode does not support async_mode_ now!");
graphs.push_back(graph); graphs.push_back(graph);
for (int i = 1; i < places.size(); ++i) { for (size_t i = 1; i < places.size(); ++i) {
auto *tmp_graph = new ir::Graph(graph->OriginProgram()); auto *tmp_graph = new ir::Graph(graph->OriginProgram());
async_graphs_.emplace_back(tmp_graph); async_graphs_.emplace_back(tmp_graph);
graphs.push_back(tmp_graph); graphs.push_back(tmp_graph);
...@@ -315,7 +315,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -315,7 +315,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name, graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
{member_->local_scopes_[0]}, 1, {member_->local_scopes_[0]}, 1,
member_->use_cuda_, member_->nccl_ctxs_.get()); member_->use_cuda_, member_->nccl_ctxs_.get());
for (int i = 1; i < member_->places_.size(); ++i) { for (size_t i = 1; i < member_->places_.size(); ++i) {
graphs[i] = graphs[i] =
build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name, build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
{member_->local_scopes_[i]}, 1, {member_->local_scopes_[i]}, 1,
......
...@@ -76,7 +76,7 @@ message PullDenseWorkerParameter { ...@@ -76,7 +76,7 @@ message PullDenseWorkerParameter {
message TableParameter { message TableParameter {
// dense table only // dense table only
optional int64 table_id = 1; optional uint64 table_id = 1;
repeated string dense_value_name = 2; repeated string dense_value_name = 2;
repeated string dense_grad_name = 3; repeated string dense_grad_name = 3;
repeated int32 push_dense_wait_times = 5; repeated int32 push_dense_wait_times = 5;
......
...@@ -45,12 +45,16 @@ class InferVarTypeContext { ...@@ -45,12 +45,16 @@ class InferVarTypeContext {
virtual bool HasInput(const std::string& name) const { virtual bool HasInput(const std::string& name) const {
PADDLE_ENFORCE_NOT_NULL(op_); PADDLE_ENFORCE_NOT_NULL(op_);
return op_->Inputs().count(name) > 0; auto& inputs = op_->Inputs();
auto input = inputs.find(name);
return input != inputs.end() && !input->second.empty();
} }
virtual bool HasOutput(const std::string& name) const { virtual bool HasOutput(const std::string& name) const {
PADDLE_ENFORCE_NOT_NULL(op_); PADDLE_ENFORCE_NOT_NULL(op_);
return op_->Outputs().count(name) > 0; auto& outputs = op_->Outputs();
auto output = outputs.find(name);
return output != outputs.end() && !output->second.empty();
} }
virtual const std::vector<std::string>& Input(const std::string& name) const { virtual const std::vector<std::string>& Input(const std::string& name) const {
......
...@@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi ...@@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc) cc_library(engine SRCS engine.cc)
cc_library(imperative_profiler SRCS profiler.cc) cc_library(imperative_profiler SRCS profiler.cc)
cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
endif() endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h"
namespace paddle {
namespace imperative {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
// Waits for one peer to connect to this worker and deliver a ncclUniqueId.
//
// ep:      endpoint of the current worker in "host:port" form. Only the port
//          is used; the listening socket is bound to INADDR_ANY.
// nccl_id: output; receives exactly NCCL_UNIQUE_ID_BYTES bytes.
//
// Throws (PADDLE_ENFORCE / PADDLE_THROW) when the endpoint is malformed or
// when any socket call fails. All file descriptors opened here are closed
// before returning or throwing.
void NCCLParallelContext::RecvNCCLID(const std::string &ep,
                                     ncclUniqueId *nccl_id) {
  auto addr = paddle::string::Split(ep, ':');
  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
                    "The endpoint should contain host and port: %s", ep);
  int port = std::stoi(addr[1]);
  int server_fd, new_socket;
  struct sockaddr_in address;
  socklen_t addrlen = sizeof(address);
  char buffer[1024] = {0};
  // SO_REUSEADDR is only enabled by a non-zero option value.
  int opt = 1;

  // creating socket fd; socket() reports failure with -1, and 0 is a valid fd
  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    PADDLE_THROW("create server fd failed");
  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) {
    close(server_fd);
    PADDLE_THROW("set socket opt failed");
  }

  memset(&address, 0, sizeof(address));
  address.sin_family = AF_INET;
  address.sin_addr.s_addr = INADDR_ANY;
  address.sin_port = htons(port);
  if (bind(server_fd, reinterpret_cast<struct sockaddr *>(&address),
           sizeof(address)) < 0) {
    close(server_fd);
    PADDLE_THROW("binding failed on ep: %s", ep);
  }

  VLOG(3) << "listening on: " << ep;
  if (listen(server_fd, 3) < 0) {
    close(server_fd);
    PADDLE_THROW("listen on server fd failed");
  }
  if ((new_socket =
           accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
                  &addrlen)) < 0) {
    close(server_fd);
    PADDLE_THROW("accept the new socket fd failed");
  }

  // A stream socket may deliver the id in several chunks, so keep reading
  // until the whole id has arrived. A return of 0 means the peer closed the
  // connection before sending everything.
  const size_t total = NCCL_UNIQUE_ID_BYTES;
  size_t received = 0;
  while (received < total) {
    ssize_t n = read(new_socket, buffer + received, total - received);
    if (n <= 0) {
      close(new_socket);
      close(server_fd);
      PADDLE_THROW("reading the ncclUniqueId from socket failed");
    }
    received += static_cast<size_t>(n);
  }
  VLOG(3) << "recevived the ncclUniqueId";
  memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);

  VLOG(3) << "closing the socket server: " << ep;
  close(new_socket);
  close(server_fd);
}
// Connects to the worker listening on `ep` and sends it the ncclUniqueId.
//
// ep:      endpoint of the receiving worker in "host:port" form; host must be
//          an IPv4 address accepted by inet_pton.
// nccl_id: the id to send; NCCL_UNIQUE_ID_BYTES bytes are transmitted.
//
// If the peer is not accepting connections yet, the connection is retried
// every 3 seconds without an upper bound. Throws (PADDLE_ENFORCE /
// PADDLE_THROW) when the endpoint is malformed, a socket cannot be created,
// or the id cannot be sent completely.
void NCCLParallelContext::SendNCCLID(const std::string &ep,
                                     ncclUniqueId *nccl_id) {
  auto addr = paddle::string::Split(ep, ':');
  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
                    "The endpoint should contain host and port: %s", ep);
  std::string host = addr[0];
  int port = std::stoi(addr[1]);
  struct sockaddr_in serv_addr;
  char buffer[1024] = {0};
  memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);

  // Zero the whole address structure (the value 0, not the character '0').
  memset(&serv_addr, 0, sizeof(serv_addr));
  serv_addr.sin_family = AF_INET;
  serv_addr.sin_port = htons(port);
  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
    PADDLE_THROW("invalied address: %s", ep);

  int sock = -1;
  while (true) {
    // The state of a socket is unspecified after a failed connect(), so every
    // attempt uses a fresh socket and discards it on failure.
    if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
      PADDLE_THROW("create socket failed");
    if (connect(sock, reinterpret_cast<struct sockaddr *>(&serv_addr),
                sizeof(serv_addr)) == 0) {
      break;
    }
    close(sock);
    VLOG(0) << "worker: " << ep
            << " is not ready, will retry after 3 seconds...";
    std::this_thread::sleep_for(std::chrono::seconds(3));
  }

  VLOG(3) << "sending the ncclUniqueId to " << ep;
  // send() may transmit fewer bytes than requested; loop until all are sent.
  const size_t total = NCCL_UNIQUE_ID_BYTES;
  size_t sent = 0;
  while (sent < total) {
    ssize_t n = send(sock, buffer + sent, total - sent, 0);
    if (n <= 0) {
      close(sock);
      PADDLE_THROW("sending the ncclUniqueId to %s failed", ep);
    }
    sent += static_cast<size_t>(n);
  }
  close(sock);
}
// Distributes a ncclUniqueId from the `root` rank to all other trainers.
//
// nccl_id: on the root rank, the id to send; on any other rank, the output
//          that receives the id.
// root:    rank that owns the id.
//
// The root sends the id, one endpoint after another, to every entry of
// trainer_endpoints_ except its own endpoint. Every other rank listens on its
// own endpoint until the id has been received.
void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
  if (strategy_.local_rank_ == root) {
    // Iterate by const reference to avoid copying each endpoint string.
    for (const auto &ep : strategy_.trainer_endpoints_) {
      if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id);
    }
  } else {
    RecvNCCLID(strategy_.current_endpoint_, nccl_id);
  }
}
// Creates the NCCL communicator of this trainer and attaches it to the CUDA
// device context of place_.
//
// Rank 0 generates the ncclUniqueId and sends it to the other trainers; all
// other ranks receive it. Afterwards every rank selects the GPU of place_,
// initializes its communicator with ncclCommInitRank and stores it on the
// CUDADeviceContext obtained from the DeviceContextPool.
void NCCLParallelContext::Init() {
  ncclUniqueId nccl_id;
  ncclComm_t comm;
  if (strategy_.local_rank_ == 0) {
    // generate the unique ncclid on the root worker; a failure here must not
    // go unnoticed, otherwise an uninitialized id would be broadcast
    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&nccl_id));
  }
  // Root sends, every other rank receives.
  BcastNCCLId(&nccl_id, 0);

  int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
  VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
          << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;

  PADDLE_ENFORCE(cudaSetDevice(gpu_id));
  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
      &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));

  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
  dev_ctx->set_nccl_comm(comm);
}
#endif
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// network header files
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#endif
#include <string>
#include <vector>
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
namespace paddle {
namespace imperative {
// Describes the role of one trainer inside a multi-trainer job.
struct ParallelStrategy {
  // Total number of ranks taking part in the job.
  int nranks_ = 1;
  // Rank of the current trainer.
  int local_rank_ = 0;
  // "host:port" endpoints of the trainers.
  std::vector<std::string> trainer_endpoints_;
  // "host:port" endpoint of the current trainer.
  std::string current_endpoint_;
};
// Abstract base of the contexts that set up multi-trainer communication.
// It stores a copy of the strategy and of the place it is constructed with;
// subclasses implement Init().
class ParallelContext {
 public:
  explicit ParallelContext(const ParallelStrategy& strategy,
                           const platform::Place& place)
      : strategy_(strategy), place_(place) {}

  virtual ~ParallelContext() = default;

  // Performs the backend-specific initialization.
  virtual void Init() = 0;

 protected:
  ParallelStrategy strategy_;  // rank / endpoint description
  platform::Place place_;      // device this context is created for
};
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
class NCCLParallelContext : ParallelContext {
public:
explicit NCCLParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: ParallelContext(strategy, place) {}
~NCCLParallelContext() {}
void BcastNCCLId(ncclUniqueId* nccl_id, int root);
void Init() override;
protected:
void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
};
#endif
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
// Builds the strategy of a two-trainer job running on localhost for the
// trainer with rank `local_rank`, which indexes the two-entry endpoint list.
imperative::ParallelStrategy GetStrategy(int local_rank) {
  imperative::ParallelStrategy result;
  result.nranks_ = 2;
  result.local_rank_ = local_rank;
  result.trainer_endpoints_ = {"127.0.0.1:9866", "127.0.0.1:9867"};
  result.current_endpoint_ = result.trainer_endpoints_[local_rank];
  return result;
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
auto strategy = GetStrategy(local_rank);
platform::CUDAPlace gpu(local_rank);
imperative::NCCLParallelContext ctx(strategy, gpu);
ctx.BcastNCCLId(nccl_id, 0);
}
// Rank 0 (in a helper thread) sends a freshly generated id to rank 1 (the
// test thread); the received bytes must match the ones that were sent.
TEST(BcastNCCLId, Run) {
  ncclUniqueId sent_id;
  platform::dynload::ncclGetUniqueId(&sent_id);
  std::thread sender(BcastNCCLId, 0, &sent_id);

  ncclUniqueId received_id;
  BcastNCCLId(1, &received_id);
  sender.join();

  EXPECT_EQ(0, std::memcmp(sent_id.internal, received_id.internal,
                           NCCL_UNIQUE_ID_BYTES));
}
#endif
...@@ -177,7 +177,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -177,7 +177,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
current_vars_map[out->Name()] = out; current_vars_map[out->Name()] = out;
} }
VLOG(3) << "input var name: " << out->Name() VLOG(3) << "output var name: " << out->Name()
<< " inited: " << out->var_->IsInitialized() << " inited: " << out->var_->IsInitialized()
<< " stop_grad: " << out->IsStopGradient(); << " stop_grad: " << out->IsStopGradient();
} }
...@@ -215,6 +215,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -215,6 +215,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
framework::Scope scope; framework::Scope scope;
op->place_ = GetExpectedPlace(expected_place, inputs); op->place_ = GetExpectedPlace(expected_place, inputs);
PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
prepared_op.func( prepared_op.func(
......
...@@ -231,6 +231,7 @@ void AnalysisConfig::Update() { ...@@ -231,6 +231,7 @@ void AnalysisConfig::Update() {
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
} }
pass_builder()->DeletePass("runtime_context_cache_pass"); pass_builder()->DeletePass("runtime_context_cache_pass");
pass_builder()->DeletePass("expected_kernel_cache_pass");
} }
if (use_mkldnn_) { if (use_mkldnn_) {
......
...@@ -259,6 +259,9 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -259,6 +259,9 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
return false; return false;
} }
PADDLE_ENFORCE_NOT_NULL(input_ptr);
PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
if (platform::is_cpu_place(place_)) { if (platform::is_cpu_place(place_)) {
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(), std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
......
...@@ -54,6 +54,7 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { ...@@ -54,6 +54,7 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
memory_owned_ = other.memory_owned_; memory_owned_ = other.memory_owned_;
} else { } else {
Resize(other.length()); Resize(other.length());
PADDLE_ENFORCE(!(other.length() > 0 && other.data() == nullptr));
memcpy(data_, other.data(), other.length()); memcpy(data_, other.data(), other.length());
length_ = other.length(); length_ = other.length();
memory_owned_ = true; memory_owned_ = true;
......
...@@ -169,6 +169,7 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() { ...@@ -169,6 +169,7 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_)); std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
// Hot fix the bug that result diff in multi-thread. // Hot fix the bug that result diff in multi-thread.
// TODO(Superjomn) re-implement a real clone here. // TODO(Superjomn) re-implement a real clone here.
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) { if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
LOG(ERROR) << "fail to call Init"; LOG(ERROR) << "fail to call Init";
return nullptr; return nullptr;
...@@ -210,6 +211,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -210,6 +211,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
return false; return false;
} }
PADDLE_ENFORCE_NOT_NULL(input_ptr);
PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
if (platform::is_cpu_place(place_)) { if (platform::is_cpu_place(place_)) {
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(), std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
...@@ -316,6 +319,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -316,6 +319,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
} }
std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config)); std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
PADDLE_ENFORCE_NOT_NULL(
dynamic_cast<NativePaddlePredictor *>(predictor.get()));
if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) { if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
return nullptr; return nullptr;
} }
......
...@@ -86,7 +86,8 @@ const std::vector<std::string> kAnakinSubgraphPasses({ ...@@ -86,7 +86,8 @@ const std::vector<std::string> kAnakinSubgraphPasses({
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"runtime_context_cache_pass", //
// "identity_scale_op_clean_pass", // // "identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", // "conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", //
...@@ -96,9 +97,9 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { ...@@ -96,9 +97,9 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", //
#endif // #endif //
"transpose_flatten_concat_fuse_pass", "transpose_flatten_concat_fuse_pass",
"expected_kernel_cache_pass", //
}); });
use_gpu_ = true; use_gpu_ = true;
...@@ -116,10 +117,14 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -116,10 +117,14 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will // NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones. // not be damaged by smaller ones.
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
// TODO(luotao): runtime_context_cache_pass should be located in the
// front, see https://github.com/PaddlePaddle/Paddle/issues/16609,
// will enhance this pass later.
"runtime_context_cache_pass", //
"attention_lstm_fuse_pass", // "attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", //
// "seqpool_concat_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", // // "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", // "fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", // "mul_lstm_fuse_pass", //
...@@ -132,9 +137,9 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -132,9 +137,9 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", // "is_test_pass", //
"identity_scale_op_clean_pass", // "expected_kernel_cache_pass", //
"runtime_context_cache_pass", //
}); });
use_gpu_ = false; use_gpu_ = false;
} }
......
...@@ -23,18 +23,11 @@ namespace analysis { ...@@ -23,18 +23,11 @@ namespace analysis {
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model); cfg->SetModel(FLAGS_infer_model);
cfg->SetProgFile("__model__");
cfg->DisableGpu(); cfg->DisableGpu();
cfg->SwitchIrOptim(); cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames(false); cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN(); cfg->EnableMKLDNN();
cfg->pass_builder()->SetPasses(
{"infer_clean_graph_pass", "mkldnn_placement_pass",
"depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
"conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
"conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
"fc_fuse_pass", "is_test_pass"});
} }
template <typename T> template <typename T>
...@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData( ...@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
std::to_string(num_images) + " is bigger than all test data size."); std::to_string(num_images) + " is bigger than all test data size.");
PaddleTensor images; PaddleTensor images;
images.name = "input"; images.name = "image";
images.shape = {num_images, 3, 224, 224}; images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32; images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224); images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels; PaddleTensor labels;
labels.name = "labels"; labels.name = "label";
labels.shape = {num_images, 1}; labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64; labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images); labels.data.Resize(sizeof(int64_t) * num_images);
...@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224; images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
TensorReader<float> image_reader(file, images_offset_in_file, TensorReader<float> image_reader(file, images_offset_in_file,
image_batch_shape, "input"); image_batch_shape, "image");
TensorReader<int64_t> label_reader(file, labels_offset_in_file, TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label"); label_batch_shape, "label");
......
...@@ -47,6 +47,7 @@ struct DataRecord { ...@@ -47,6 +47,7 @@ struct DataRecord {
num_lines++; num_lines++;
std::vector<std::string> data; std::vector<std::string> data;
split(line, '\t', &data); split(line, '\t', &data);
PADDLE_ENFORCE(data.size() >= 4);
// load title1 data // load title1 data
std::vector<int64_t> title1_data; std::vector<int64_t> title1_data;
split_to_int64(data[0], ' ', &title1_data); split_to_int64(data[0], ' ', &title1_data);
......
...@@ -150,6 +150,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { ...@@ -150,6 +150,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
if (use_mkldnn) { if (use_mkldnn) {
cfg->EnableMKLDNN(); cfg->EnableMKLDNN();
} }
// Enable seqpool_concat_fuse_pass, disabled by default since it takes much
// time
cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass");
} }
void profile(bool use_mkldnn = false) { void profile(bool use_mkldnn = false) {
......
...@@ -214,28 +214,23 @@ TEST(Analyzer_Transformer, fuse_statis) { ...@@ -214,28 +214,23 @@ TEST(Analyzer_Transformer, fuse_statis) {
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
// void compare(bool use_mkldnn = false) { void compare(bool use_mkldnn = false) {
// AnalysisConfig cfg; AnalysisConfig cfg;
// SetConfig(&cfg); SetConfig(&cfg);
// if (use_mkldnn) { if (use_mkldnn) {
// cfg.EnableMKLDNN(); cfg.EnableMKLDNN();
// } }
//
// std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
// SetInput(&input_slots_all); SetInput(&input_slots_all);
// CompareNativeAndAnalysis( CompareNativeAndAnalysis(
// reinterpret_cast<const PaddlePredictor::Config *>(&cfg), reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
// input_slots_all); }
// }
TEST(Analyzer_Transformer, compare) { compare(); }
// TODO(yihuaxu): #ifdef PADDLE_WITH_MKLDNN
// Disable compare and compare_mkldnn temporary, see TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
// https://github.com/paddlePaddle/Paddle/issues/16316 for details. #endif
// TEST(Analyzer_Transformer, compare) { compare(); }
// #ifdef PADDLE_WITH_MKLDNN
// TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */);
// }
// #endif
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
# INT8 MKL-DNN quantization
This document describes how to use the Paddle inference engine to convert an FP32 model to an INT8 model, using ResNet-50 and MobileNet-V1 as examples. We provide instructions for enabling INT8 MKL-DNN quantization in Paddle inference and show the accuracy and performance results for ResNet-50 and MobileNet-V1.
## 0. Install PaddlePaddle
Follow PaddlePaddle [installation instruction](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following cmake arguments.
```
cmake .. -DWITH_TESTING=ON -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_SWIG_PY=OFF -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
```
Note: MKL-DNN and MKL are required.
## 1. Enable INT8 MKL-DNN quantization
For reference, please examine the code of unit test enclosed in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
* ### Create Analysis config
INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease)
* ### Create quantize config by analysis config
We enable the MKL-DNN quantization procedure by calling an appropriate method from analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies etc.) can be set through quantizer config which is present in the analysis config. It is also necessary to specify a pre-processed warmup dataset and desired batch size.
```cpp
//Enable MKL-DNN quantization
cfg.EnableMkldnnQuantizer();
//use analysis config to call the MKL-DNN quantization config
cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
```
## 2. Accuracy and Performance benchmark
We provide the accuracy and performance results measured on an Intel(R) Xeon(R) Gold 6271 using a single core.
>**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.48% | 0.15% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.36% | 0.42% |
>**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 13.17 images/s | 49.84 images/s | 3.78 |
| MobileNet-V1 | Full ImageNet Val | 75.49 images/s | 232.38 images/s | 3.07 |
Notes:
* Measurement of accuracy requires a model which accepts two inputs: data and labels.
* Using a different sampling batch size may cause slight differences in INT8 top-1 accuracy.
* C-API performance is better than Python API performance because of the Python overhead. The overhead is especially noticeable for models with a small amount of computation.
## 3. Commands to reproduce the above accuracy and performance benchmark
* #### Full dataset (Single core)
* ##### Download full ImageNet Validation Dataset
```bash
cd /PATH/TO/PADDLE/build
python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
```
The converted data binary file is saved by default in ~/.cache/paddle/dataset/int8/download/int8_full_val.bin
* ##### ResNet50 Full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_resnet50 --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
* ##### Mobilenet-v1 Full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_mobilenet --infer_model=third_party/inference_demo/int8v2/mobilenet/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
...@@ -55,6 +55,9 @@ DEFINE_bool(record_benchmark, false, ...@@ -55,6 +55,9 @@ DEFINE_bool(record_benchmark, false,
DEFINE_double(accuracy, 1e-3, "Result Accuracy."); DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy."); DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
DEFINE_bool(warmup, false,
"Use warmup to calculate elapsed_time more accurately. "
"To reduce CI time, it sets false in default.");
DECLARE_bool(profile); DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads); DECLARE_int32(paddle_num_threads);
...@@ -316,7 +319,8 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -316,7 +319,8 @@ void PredictionRun(PaddlePredictor *predictor,
int num_threads, int tid) { int num_threads, int tid) {
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
int iterations = inputs.size(); // process the whole dataset ... int iterations = inputs.size(); // process the whole dataset ...
if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size()) if (FLAGS_iterations > 0 &&
FLAGS_iterations < static_cast<int64_t>(inputs.size()))
iterations = iterations =
FLAGS_iterations; // ... unless the number of iterations is set FLAGS_iterations; // ... unless the number of iterations is set
outputs->resize(iterations); outputs->resize(iterations);
...@@ -329,14 +333,14 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -329,14 +333,14 @@ void PredictionRun(PaddlePredictor *predictor,
#endif #endif
if (!FLAGS_zero_copy) { if (!FLAGS_zero_copy) {
run_timer.tic(); run_timer.tic();
for (size_t i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
for (int j = 0; j < num_times; j++) { for (int j = 0; j < num_times; j++) {
predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size); predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
} }
} }
elapsed_time = run_timer.toc(); elapsed_time = run_timer.toc();
} else { } else {
for (size_t i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
run_timer.tic(); run_timer.tic();
for (int j = 0; j < num_times; j++) { for (int j = 0; j < num_times; j++) {
...@@ -366,9 +370,10 @@ void TestOneThreadPrediction( ...@@ -366,9 +370,10 @@ void TestOneThreadPrediction(
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) { std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
auto predictor = CreateTestPredictor(config, use_analysis); auto predictor = CreateTestPredictor(config, use_analysis);
PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, if (FLAGS_warmup) {
0); PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0); }
PredictionRun(predictor.get(), inputs, outputs, 1, 0);
} }
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
...@@ -395,7 +400,10 @@ void TestMultiThreadPrediction( ...@@ -395,7 +400,10 @@ void TestMultiThreadPrediction(
->SetMkldnnThreadID(static_cast<int>(tid) + 1); ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
} }
#endif #endif
PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, tid); if (FLAGS_warmup) {
PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads,
tid);
}
PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid); PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid);
}); });
} }
......
abs
acos
asin
atan
attention_lstm attention_lstm
brelu
conv_shift conv_shift
cos
cos_sim cos_sim
dequantize dequantize
elu
fc fc
flatten flatten
fsp fsp
...@@ -21,14 +14,8 @@ fusion_seqconv_eltadd_relu ...@@ -21,14 +14,8 @@ fusion_seqconv_eltadd_relu
fusion_seqexpand_concat_fc fusion_seqexpand_concat_fc
fusion_seqpool_concat fusion_seqpool_concat
fusion_squared_mat_sub fusion_squared_mat_sub
gelu
gru gru
hard_shrink
hierarchical_sigmoid hierarchical_sigmoid
leaky_relu
log
logsigmoid
lookup_table
lrn lrn
lstm_unit lstm_unit
lstmp lstmp
...@@ -39,10 +26,11 @@ modified_huber_loss ...@@ -39,10 +26,11 @@ modified_huber_loss
nce nce
pool2d pool2d
pool3d pool3d
pow
prelu prelu
quantize quantize
rank_loss rank_loss
reduce_all
reduce_any
reduce_max reduce_max
reduce_mean reduce_mean
reduce_min reduce_min
...@@ -51,26 +39,10 @@ reduce_sum ...@@ -51,26 +39,10 @@ reduce_sum
requantize requantize
reshape reshape
rnn_memory_helper rnn_memory_helper
round
sequence_softmax sequence_softmax
sin
softplus
softshrink
softsign
space_to_depth
spp spp
square
squared_l2_distance
squared_l2_norm
squeeze squeeze
stanh
swish
tanh_shrink
teacher_student_sigmoid_loss
tensor_array_to_tensor tensor_array_to_tensor
thresholded_relu
transpose transpose
tree_conv
unpool unpool
unsqueeze unsqueeze
warpctc
...@@ -12,6 +12,9 @@ ...@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/cudnn_desc.h" #include "paddle/fluid/platform/cudnn_desc.h"
...@@ -82,6 +85,8 @@ template <typename T> ...@@ -82,6 +85,8 @@ template <typename T>
struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
}; };
template <typename T> template <typename T>
...@@ -94,6 +99,8 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> { ...@@ -94,6 +99,8 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) { : CudnnActivationGradFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
}; };
template <typename T> template <typename T>
...@@ -105,6 +112,8 @@ template <typename T> ...@@ -105,6 +112,8 @@ template <typename T>
struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
}; };
template <typename T> template <typename T>
...@@ -116,6 +125,8 @@ template <typename T> ...@@ -116,6 +125,8 @@ template <typename T>
struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
}; };
template <typename Functor> template <typename Functor>
...@@ -140,10 +151,13 @@ class CudnnActivationGradKernel ...@@ -140,10 +151,13 @@ class CudnnActivationGradKernel
public: public:
using T = typename Functor::ELEMENT_TYPE; using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out.");
const framework::Tensor *X, *Out, *dOut; const framework::Tensor *X, *Out, *dOut;
X = Out = dOut = nullptr; X = Out = dOut = nullptr;
framework::Tensor* dX = nullptr; framework::Tensor* dX = nullptr;
ExtractActivationGradTensor(context, &X, &Out, &dOut, &dX); ExtractActivationGradTensor<Functor::FwdDeps()>(context, &X, &Out, &dOut,
&dX);
dX->mutable_data<T>(context.GetPlace()); dX->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<CUDADeviceContext>(); auto& dev_ctx = context.template device_context<CUDADeviceContext>();
Functor functor(dev_ctx); Functor functor(dev_ctx);
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <type_traits>
#include <unordered_map> #include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -27,6 +29,25 @@ namespace operators { ...@@ -27,6 +29,25 @@ namespace operators {
using paddle::framework::Tensor; using paddle::framework::Tensor;
// Compile-time predicate on an activation backward functor: true when the
// functor reports that its only forward dependency is Out, or that it has no
// forward dependency at all. The op registration uses the result to decide
// whether an in-place pass is attached to the op.
template <typename GradFunctor>
static constexpr bool CanInplaceAct() {
  return GradFunctor::FwdDeps() == kNoDeps ||
         GradFunctor::FwdDeps() == kDepOut;
}
// Returns a newly allocated set holding the type name of every op listed by
// FOR_EACH_ACTIVATION_OP whose float backward functor satisfies
// CanInplaceAct (its FwdDeps() is kDepOut or kNoDeps).
std::unique_ptr<std::unordered_set<std::string>> GetInplaceOpSet() {
std::unique_ptr<std::unordered_set<std::string>> ret(
new std::unordered_set<std::string>());
// Per-op callback handed to FOR_EACH_ACTIVATION_OP: the first argument is
// stringized into the set entry, the second and third are ignored, and only
// the backward functor is inspected.
#define INSERT_INTO_INPLACE_OP_SET(op_type, __omitted, fwd_functor, \
bwd_functor) \
if (CanInplaceAct<bwd_functor<float>>()) { \
ret->insert(#op_type); \
}
FOR_EACH_ACTIVATION_OP(INSERT_INTO_INPLACE_OP_SET);
// The helper macro is only meaningful inside this function.
#undef INSERT_INTO_INPLACE_OP_SET
return ret;
}
#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \
class OP_NAME##OpMaker \ class OP_NAME##OpMaker \
: public ::paddle::framework::OpProtoAndCheckerMaker { \ : public ::paddle::framework::OpProtoAndCheckerMaker { \
...@@ -50,26 +71,32 @@ using paddle::framework::Tensor; ...@@ -50,26 +71,32 @@ using paddle::framework::Tensor;
} \ } \
} }
#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \ template <ActBwdOpFwdDeps kDepValue>
class OP_NAME##GradMaker \ class ActivationGradOpDescMaker : public framework::SingleGradOpDescMaker {
: public ::paddle::framework::SingleGradOpDescMaker { \ public:
public: \ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
\ protected:
protected: \ std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { \ std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
auto* op = new ::paddle::framework::OpDesc(); \ op->SetType(ForwardOpType() + "_grad");
op->SetType(#KERNEL_TYPE "_grad"); \ op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetInput("Out", Output("Out")); \ op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetInput(::paddle::framework::GradVarName("Out"), \ op->SetAttrMap(Attrs());
OutputGrad("Out")); \
\ if (static_cast<int>(kDepValue) &
op->SetAttrMap(Attrs()); \ static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
\ op->SetInput("X", Input("X"));
op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \ }
return std::unique_ptr<::paddle::framework::OpDesc>(op); \
} \ if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
op->SetInput("Out", Output("Out"));
}
return op;
} }
};
framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper, const framework::OperatorWithKernel& oper,
...@@ -129,14 +156,15 @@ class ActivationOpGrad : public framework::OperatorWithKernel { ...@@ -129,14 +156,15 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
ctx->ShareDim("Out", framework::GradVarName("X")); auto out_grad_name = framework::GradVarName("Out");
ctx->ShareLoD("Out", framework::GradVarName("X")); ctx->ShareDim(out_grad_name, framework::GradVarName("X"));
ctx->ShareLoD(out_grad_name, framework::GradVarName("X"));
} }
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this, "Out"); return GetKernelType(ctx, *this, framework::GradVarName("Out"));
} }
}; };
...@@ -558,79 +586,27 @@ REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); ...@@ -558,79 +586,27 @@ REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc); REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Gelu, gelu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Floor, floor);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Sqrt, sqrt);
REGISTER_ACTIVATION_OP_GRAD_MAKER(SoftRelu, soft_relu);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu6, relu6);
REGISTER_ACTIVATION_OP_GRAD_MAKER(Reciprocal, reciprocal);
REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid);
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \ #define REGISTER_ACTIVATION_OP(KERNEL_TYPE, OP_NAME, functor, grad_functor) \
__macro(Sigmoid, sigmoid); \ REGISTER_OPERATOR( \
__macro(Relu, relu); \ KERNEL_TYPE, ops::ActivationOp, ops::OP_NAME##OpMaker, \
__macro(Exp, exp); \ ops::ActivationOpInferVarType, \
__macro(Tanh, tanh); \ ops::ActivationGradOpDescMaker<ops::grad_functor<float>::FwdDeps()>, \
__macro(Ceil, ceil); \ std::conditional<ops::CanInplaceAct<ops::grad_functor<float>>(), \
__macro(Floor, floor); \ ::paddle::framework::SingleOpInplaceInToOut, \
__macro(Sqrt, sqrt); \ void>::type); \
__macro(SoftRelu, soft_relu); \ REGISTER_OPERATOR( \
__macro(Relu6, relu6); \ KERNEL_TYPE##_grad, ops::ActivationOpGrad, \
__macro(Reciprocal, reciprocal); \ std::conditional<ops::CanInplaceAct<ops::grad_functor<float>>(), \
__macro(HardSigmoid, hard_sigmoid); ::paddle::framework::SingleOpInplaceInToOut, \
void>::type)
#define FOR_EACH_OP_FUNCTOR(__macro) \
__macro(LogSigmoid, logsigmoid); \ #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor, \
__macro(SoftShrink, softshrink); \ grad_functor) \
__macro(Abs, abs); \
__macro(Cos, cos); \
__macro(Acos, acos); \
__macro(Sin, sin); \
__macro(Asin, asin); \
__macro(Atan, atan); \
__macro(Round, round); \
__macro(Log, log); \
__macro(Square, square); \
__macro(Gelu, gelu); \
__macro(BRelu, brelu); \
__macro(Pow, pow); \
__macro(STanh, stanh); \
__macro(Softplus, softplus); \
__macro(Softsign, softsign); \
__macro(LeakyRelu, leaky_relu); \
__macro(TanhShrink, tanh_shrink); \
__macro(ELU, elu); \
__macro(HardShrink, hard_shrink); \
__macro(Swish, swish); \
__macro(ThresholdedRelu, thresholded_relu);
#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
::paddle::operators::OP_NAME##OpMaker, \
::paddle::operators::ActivationOpInferVarType, \
::paddle::operators::OP_NAME##GradMaker, \
::paddle::framework::SingleOpInplaceInToOut); \
REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad, \
::paddle::framework::SingleOpInplaceInToOut)
#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
::paddle::operators::OP_NAME##OpMaker, \
::paddle::operators::ActivationOpInferVarType, \
::paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \ REGISTER_OP_CPU_KERNEL( \
act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \ act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
ops::functor<float>>, \ ops::functor<float>>, \
...@@ -643,6 +619,5 @@ namespace ops = paddle::operators; ...@@ -643,6 +619,5 @@ namespace ops = paddle::operators;
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \ ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<double>>); ops::grad_functor<double>>);
FOR_EACH_OP_FUNCTOR(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
FOR_EACH_INPLACE_OP_FUNCTOR(REGISTER_INPLACE_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
...@@ -15,7 +15,8 @@ limitations under the License. */ ...@@ -15,7 +15,8 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \ #define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \
grad_functor) \
REGISTER_OP_CUDA_KERNEL( \ REGISTER_OP_CUDA_KERNEL( \
act_type, \ act_type, \
ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>, \ ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>, \
...@@ -30,4 +31,4 @@ namespace plat = paddle::platform; ...@@ -30,4 +31,4 @@ namespace plat = paddle::platform;
ops::ActivationGradKernel<plat::CUDADeviceContext, \ ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>); ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL);
...@@ -121,9 +121,11 @@ class AffineGridOpKernel : public framework::OpKernel<T> { ...@@ -121,9 +121,11 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
// TODO(wanghaoshuang): Refine batched matrix multiply // TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
{static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3}); Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2}); Tensor sliced_out = output->Slice(i, i + 1).Resize(
{static_cast<int64_t>(h) * static_cast<int64_t>(w), 2});
blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out, blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
T(0)); T(0));
} }
...@@ -161,8 +163,10 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> { ...@@ -161,8 +163,10 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
// TODO(wanghaoshuang): Refine batched matrix multiply // TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2}); {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize(
{static_cast<int64_t>(h) * static_cast<int64_t>(w), 2});
Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3}); Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1), blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
&sliced_theta_grad, T(0)); &sliced_theta_grad, T(0));
......
...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
...@@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp { ...@@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp {
framework::Executor exec(dev_place); framework::Executor exec(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block"); auto *block = Attr<framework::BlockDesc *>("sub_block");
exec.Run(*block->Program(), &cur_scope, block->ID(), false);
AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"), const auto &ins = Inputs("Input");
Outputs(framework::GradVarName("Input"))); const auto &d_ins = Outputs(framework::GradVarName("Input"));
const auto &conds = Inputs("Cond");
const auto &d_conds = Outputs(framework::GradVarName("Cond"));
std::vector<std::string> ins_conds_grads;
ins_conds_grads.reserve(ins.size() + conds.size());
for (auto &in : ins) {
ins_conds_grads.emplace_back(framework::GradVarName(in));
}
for (auto &cond : conds) {
ins_conds_grads.emplace_back(framework::GradVarName(cond));
}
exec.Run(*block->Program(), &cur_scope, block->ID(), false, true,
ins_conds_grads);
AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(),
ins.size(), d_ins);
AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"), AssignLocalGradientToGlobal(dev_place, cur_scope,
Outputs(framework::GradVarName("Cond"))); ins_conds_grads.data() + ins.size(),
conds.size(), d_conds);
} }
} }
private: private:
void AssignLocalGradientToGlobal( void AssignLocalGradientToGlobal(
const platform::Place &place, const framework::Scope &cur_scope, const platform::Place &place, const framework::Scope &cur_scope,
const std::vector<std::string> &p_names, const std::string *p_grad_names, size_t p_grad_names_num,
const std::vector<std::string> &pg_names) const { const std::vector<std::string> &pg_names) const {
for (size_t i = 0; i < p_names.size(); ++i) { for (size_t i = 0; i < p_grad_names_num; ++i) {
auto out_grad_name = pg_names[i]; auto out_grad_name = pg_names[i];
auto in_grad_name = framework::GradVarName(p_names[i]); const auto &in_grad_name = p_grad_names[i];
auto *in_var = cur_scope.FindVar(in_grad_name); auto *in_var = cur_scope.FindVar(in_grad_name);
if (in_var == nullptr) { if (in_var == nullptr) {
continue; continue;
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
**/ **/
#include "paddle/fluid/operators/detection/gpc.h" #include "paddle/fluid/operators/detection/gpc.h"
#include "paddle/fluid/platform/enforce.h"
namespace gpc { namespace gpc {
...@@ -689,6 +690,7 @@ static bbox *create_contour_bboxes(gpc_polygon *p) { ...@@ -689,6 +690,7 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox), gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
const_cast<char *>("Bounding box creation")); const_cast<char *>("Bounding box creation"));
PADDLE_ENFORCE_NOT_NULL(box);
/* Construct contour bounding boxes */ /* Construct contour bounding boxes */
for (c = 0; c < p->num_contours; c++) { for (c = 0; c < p->num_contours; c++) {
...@@ -852,6 +854,7 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { ...@@ -852,6 +854,7 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
/* Create an extended hole array */ /* Create an extended hole array */
gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int), gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
const_cast<char *>("contour hole addition")); const_cast<char *>("contour hole addition"));
PADDLE_ENFORCE_NOT_NULL(extended_hole);
/* Create an extended contour array */ /* Create an extended contour array */
gpc_malloc<gpc_vertex_list>(extended_contour, gpc_malloc<gpc_vertex_list>(extended_contour,
...@@ -969,6 +972,7 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, ...@@ -969,6 +972,7 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
/* Build scanbeam table from scanbeam tree */ /* Build scanbeam table from scanbeam tree */
gpc_malloc<double>(sbt, sbt_entries * sizeof(double), gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
const_cast<char *>("sbt creation")); const_cast<char *>("sbt creation"));
PADDLE_ENFORCE_NOT_NULL(sbt);
build_sbt(&scanbeam, sbt, sbtree); build_sbt(&scanbeam, sbt, sbtree);
scanbeam = 0; scanbeam = 0;
free_sbtree(&sbtree); free_sbtree(&sbtree);
...@@ -1604,6 +1608,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, ...@@ -1604,6 +1608,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
/* Build scanbeam table from scanbeam tree */ /* Build scanbeam table from scanbeam tree */
gpc_malloc<double>(sbt, sbt_entries * sizeof(double), gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
const_cast<char *>("sbt creation")); const_cast<char *>("sbt creation"));
PADDLE_ENFORCE_NOT_NULL(sbt);
build_sbt(&scanbeam, sbt, sbtree); build_sbt(&scanbeam, sbt, sbtree);
scanbeam = 0; scanbeam = 0;
free_sbtree(&sbtree); free_sbtree(&sbtree);
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <nccl.h> #include <nccl.h>
#endif #endif
#include <limits> #include <limits>
#include <memory>
#include <thread> // NOLINT #include <thread> // NOLINT
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
...@@ -104,8 +105,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -104,8 +105,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
payload->memory_size()); payload->memory_size());
if (payload->memory_size() >= std::numeric_limits<int>::max()) { if (payload->memory_size() >= std::numeric_limits<int>::max()) {
LOG(FATAL) << "AppendZeroCopy varname:" << name LOG(FATAL) << "FATAL error: varname:" << name
<< ", vlen:" << payload->memory_size(); << ", vlen:" << payload->memory_size()
<< " >= std::numeric_limits<int>::max():"
<< std::numeric_limits<int>::max() << ", so exit!";
} }
// steal reference of tensor data // steal reference of tensor data
::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows
......
...@@ -37,10 +37,19 @@ class InterpolateOp : public framework::OperatorWithKernel { ...@@ -37,10 +37,19 @@ class InterpolateOp : public framework::OperatorWithKernel {
"Interpolation method can only be \"bilinear\" or \"nearest\"."); "Interpolation method can only be \"bilinear\" or \"nearest\".");
auto dim_x = ctx->GetInputDim("X"); // NCHW format auto dim_x = ctx->GetInputDim("X"); // NCHW format
int out_h = ctx->Attrs().Get<int>("out_h");
int out_w = ctx->Attrs().Get<int>("out_w");
PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
int out_h, out_w;
float scale = ctx->Attrs().Get<float>("scale");
if (scale > 0) {
// round down
out_h = static_cast<int>(dim_x[2] * scale);
out_w = static_cast<int>(dim_x[3] * scale);
} else {
out_h = ctx->Attrs().Get<int>("out_h");
out_w = ctx->Attrs().Get<int>("out_w");
}
if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
auto out_size_dim = ctx->GetInputDim("OutSize"); auto out_size_dim = ctx->GetInputDim("OutSize");
PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
...@@ -77,6 +86,7 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -77,6 +86,7 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("out_h", "output height of interpolate op."); AddAttr<int>("out_h", "output height of interpolate op.");
AddAttr<int>("out_w", "output width of interpolate op."); AddAttr<int>("out_w", "output width of interpolate op.");
AddAttr<float>("scale", "scale factor of interpolate op.").SetDefault(0.);
AddAttr<std::string>("interp_method", AddAttr<std::string>("interp_method",
"(string, default \"bilinear\"), interpolation " "(string, default \"bilinear\"), interpolation "
"method, can be \"bilinear\" for " "method, can be \"bilinear\" for "
......
...@@ -192,9 +192,21 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> { ...@@ -192,9 +192,21 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
auto* output = ctx.Output<Tensor>("Out"); auto* output = ctx.Output<Tensor>("Out");
auto* input_data = input->data<T>(); auto* input_data = input->data<T>();
int n = input->dims()[0];
int c = input->dims()[1];
int in_h = input->dims()[2];
int in_w = input->dims()[3];
auto interp_method = ctx.Attr<std::string>("interp_method"); auto interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h"); int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w"); int out_w = ctx.Attr<int>("out_w");
// A positive "scale" attribute overrides the out_h/out_w attributes: each
// output extent is the input extent multiplied by scale, truncated to int.
// The cast is spelled out so the float -> int narrowing is explicit and
// matches the CPU kernel.
float scale = ctx.Attr<float>("scale");
if (scale > 0) {
  out_h = static_cast<int>(in_h * scale);
  out_w = static_cast<int>(in_w * scale);
}
auto out_size = ctx.Input<Tensor>("OutSize"); auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) { if (out_size != nullptr) {
Tensor sizes; Tensor sizes;
...@@ -207,11 +219,6 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> { ...@@ -207,11 +219,6 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
bool align_corners = ctx.Attr<bool>("align_corners"); bool align_corners = ctx.Attr<bool>("align_corners");
int align_mode = ctx.Attr<int>("align_mode"); int align_mode = ctx.Attr<int>("align_mode");
int n = input->dims()[0];
int c = input->dims()[1];
int in_h = input->dims()[2];
int in_w = input->dims()[3];
auto* output_data = auto* output_data =
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace()); output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
...@@ -268,14 +275,20 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -268,14 +275,20 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
math::SetConstant<platform::CUDADeviceContext, T> zero; math::SetConstant<platform::CUDADeviceContext, T> zero;
zero(device_ctx, input_grad, static_cast<T>(0.0)); zero(device_ctx, input_grad, static_cast<T>(0.0));
int n = input_grad->dims()[0];
int c = input_grad->dims()[1];
int in_h = input_grad->dims()[2];
int in_w = input_grad->dims()[3];
auto interp_method = ctx.Attr<std::string>("interp_method"); auto interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h"); int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w"); int out_w = ctx.Attr<int>("out_w");
// A positive "scale" attribute overrides the out_h/out_w attributes: each
// output extent is the input extent multiplied by scale, truncated to int.
float scale = ctx.Attr<float>("scale");
if (scale > 0) {
  out_h = static_cast<int>(in_h * scale);
  // This line used to read `out_w - in_w* scale;`, a subtraction whose result
  // was discarded, so out_w silently kept the attribute value instead of
  // being scaled.
  out_w = static_cast<int>(in_w * scale);
}
auto out_size = ctx.Input<Tensor>("OutSize"); auto out_size = ctx.Input<Tensor>("OutSize");
bool align_corners = ctx.Attr<bool>("align_corners");
int align_mode = ctx.Attr<int>("align_mode");
if (out_size != nullptr) { if (out_size != nullptr) {
Tensor sizes; Tensor sizes;
framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes); framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
...@@ -284,10 +297,8 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -284,10 +297,8 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
out_w = size_data[1]; out_w = size_data[1];
} }
int n = input_grad->dims()[0]; bool align_corners = ctx.Attr<bool>("align_corners");
int c = input_grad->dims()[1]; int align_mode = ctx.Attr<int>("align_mode");
int in_h = input_grad->dims()[2];
int in_w = input_grad->dims()[3];
int in_hw = in_h * in_w; int in_hw = in_h * in_w;
int out_hw = out_h * out_w; int out_hw = out_h * out_w;
......
...@@ -163,9 +163,21 @@ class InterpolateKernel : public framework::OpKernel<T> { ...@@ -163,9 +163,21 @@ class InterpolateKernel : public framework::OpKernel<T> {
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out"); auto* output = ctx.Output<Tensor>("Out");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
std::string interp_method = ctx.Attr<std::string>("interp_method"); std::string interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h"); int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w"); int out_w = ctx.Attr<int>("out_w");
float scale = ctx.Attr<float>("scale");
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
}
auto out_size = ctx.Input<Tensor>("OutSize"); auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) { if (out_size != nullptr) {
auto out_size_data = out_size->data<int>(); auto out_size_data = out_size->data<int>();
...@@ -175,11 +187,6 @@ class InterpolateKernel : public framework::OpKernel<T> { ...@@ -175,11 +187,6 @@ class InterpolateKernel : public framework::OpKernel<T> {
bool align_corners = ctx.Attr<bool>("align_corners"); bool align_corners = ctx.Attr<bool>("align_corners");
int align_mode = ctx.Attr<int>("align_mode"); int align_mode = ctx.Attr<int>("align_mode");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace()); output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
auto& device_ctx = auto& device_ctx =
ctx.template device_context<platform::CPUDeviceContext>(); ctx.template device_context<platform::CPUDeviceContext>();
...@@ -221,23 +228,31 @@ class InterpolateGradKernel : public framework::OpKernel<T> { ...@@ -221,23 +228,31 @@ class InterpolateGradKernel : public framework::OpKernel<T> {
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X")); auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
std::string interp_method = ctx.Attr<std::string>("interp_method"); std::string interp_method = ctx.Attr<std::string>("interp_method");
int out_h = ctx.Attr<int>("out_h"); int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w"); int out_w = ctx.Attr<int>("out_w");
float scale = ctx.Attr<float>("scale");
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
}
auto out_size = ctx.Input<Tensor>("OutSize"); auto out_size = ctx.Input<Tensor>("OutSize");
if (out_size != nullptr) { if (out_size != nullptr) {
auto out_size_data = out_size->data<int>(); auto out_size_data = out_size->data<int>();
out_h = out_size_data[0]; out_h = out_size_data[0];
out_w = out_size_data[1]; out_w = out_size_data[1];
} }
bool align_corners = ctx.Attr<bool>("align_corners"); bool align_corners = ctx.Attr<bool>("align_corners");
int align_mode = ctx.Attr<int>("align_mode"); int align_mode = ctx.Attr<int>("align_mode");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace()); input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
auto& device_ctx = auto& device_ctx =
ctx.template device_context<platform::CPUDeviceContext>(); ctx.template device_context<platform::CPUDeviceContext>();
......
...@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) { ...@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) {
TEST(JITKernel_pool, more) { TEST(JITKernel_pool, more) {
const auto& kers = jit::KernelPool::Instance().AllKernels(); const auto& kers = jit::KernelPool::Instance().AllKernels();
#if defined(__APPLE__) || defined(__OSX__) size_t target_num = 8;
EXPECT_EQ(kers.size(), 10UL);
#else #ifdef __AVX__
#ifdef PADDLE_WITH_MKLML target_num += 2;
EXPECT_EQ(kers.size(), 22UL);
#else
EXPECT_EQ(kers.size(), 8UL);
#endif #endif
#ifdef PADDLE_WITH_MKLML
target_num += 12;
#endif #endif
EXPECT_EQ(kers.size(), target_num);
} }
TEST(JITKernel_pool, refer) { TEST(JITKernel_pool, refer) {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/linspace_op.h"
namespace paddle {
namespace operators {
// Operator definition for `linspace`.
//
// Validates that the three scalar inputs (Start, Stop, Num) and the output
// (Out) are wired up, and that each input has shape [1]. The number of output
// elements is read from the *value* of Num by the kernels, so the output dim
// is declared as {-1} here.
class LinspaceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks presence and shape of every input/output and sets Out's dim.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Start"),
                   "Input(Start) of LinspaceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Stop"),
                   "Input(Stop) of LinspaceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Num"),
                   "Input(Num) of LinspaceOp should not be null.");
    // The message previously spelled the output as "OUt"; it now matches the
    // real output name so the error can be searched for.
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of LinspaceOp should not be null.");

    auto start_dims = ctx->GetInputDim("Start");
    PADDLE_ENFORCE((start_dims.size() == 1) && (start_dims[0] == 1),
                   "The shape of Input(Start) should be [1].");

    auto stop_dims = ctx->GetInputDim("Stop");
    PADDLE_ENFORCE((stop_dims.size() == 1) && (stop_dims[0] == 1),
                   "The shape of Input(Stop) should be [1].");

    auto num_dims = ctx->GetInputDim("Num");
    PADDLE_ENFORCE((num_dims.size() == 1) && (num_dims[0] == 1),
                   "The shape of Input(Num) should be [1].");

    // Length depends on the data held by Num, which is not a shape.
    ctx->SetOutputDim("Out", {-1});
  }

 protected:
  // The kernel data type follows the type of the Start tensor; the plain
  // library and "any" layout are requested.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    framework::LibraryType library{framework::LibraryType::kPlain};
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
    return framework::OpKernelType(
        ctx.Input<framework::Tensor>("Start")->type(), ctx.device_context(),
        layout, library);
  }
};
// Declares the proto (inputs, output and user-facing description) of the
// `linspace` operator. Start, Stop and Num are each described as tensors of
// shape [1]; Out holds the generated sequence.
class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
// Registers the three inputs, the single output and the op comment.
void Make() override {
AddInput("Start",
"First entry in the sequence. It is a tensor of shape [1], should "
"be of type float32 or float64.");
AddInput("Stop",
"Last entry in the sequence. It is a tensor of shape [1], should "
"be of type float32 or float64.");
AddInput("Num",
"Number of entry in the sequence. It is a tensor of shape [1], "
"should be of type int32.");
AddOutput("Out", "A sequence of numbers.");
AddComment(R"DOC(
Return fixed number of evenly spaced values within a given interval. First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
)DOC");
}
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Registers `linspace` with its op class and proto maker; no gradient op is
// attached.
REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker);
// CPU kernels are provided for float and double outputs.
REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>,
ops::CPULinspaceKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/linspace_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
// Loop header for 1-D CUDA kernels: declares `int i`, starts it at this
// thread's global index (blockIdx.x * blockDim.x + threadIdx.x) and advances it
// by the total number of launched threads (blockDim.x * gridDim.x) until it
// reaches `n`. The statement or block that follows the macro is the loop body.
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
// Writes out[i] = start + step * i for every i in [0, size).
//
// Each thread begins at its global 1-D index and then jumps ahead by the total
// number of launched threads, so any grid size covers the whole range.
template <typename T>
__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < size) {
    out[idx] = start + step * idx;
    idx += blockDim.x * gridDim.x;
  }
}
// Stores `start` into the first element of `out`. Every launched thread
// performs the same single write.
template <typename T>
__global__ void LinspaceSpecialKernel(T start, T* out) {
  *out = start;
}
// CUDA kernel wrapper for `linspace`.
//
// The scalar inputs are copied to a CPU-place tensor so their values can be
// read on the host, the output is resized to `num` elements, and the sequence
// is then filled by LinspaceKernel on the device stream.
template <typename T>
class CUDALinspaceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const auto* start_tensor = context.Input<framework::Tensor>("Start");
    const auto* stop_tensor = context.Input<framework::Tensor>("Stop");
    const auto* num_tensor = context.Input<framework::Tensor>("Num");
    auto* result = context.Output<framework::Tensor>("Out");

    // One staging tensor is reused for all three host-side reads.
    framework::Tensor host_copy;

    framework::TensorCopy(*start_tensor, platform::CPUPlace(), &host_copy);
    const T start = host_copy.data<T>()[0];

    framework::TensorCopy(*stop_tensor, platform::CPUPlace(), &host_copy);
    const T stop = host_copy.data<T>()[0];

    framework::TensorCopy(*num_tensor, platform::CPUPlace(), &host_copy);
    const int32_t num = host_copy.data<int32_t>()[0];

    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");

    result->Resize(framework::make_ddim({num}));
    T* result_data = result->mutable_data<T>(context.GetPlace());

    // With a single element there is no spacing; a zero step makes the kernel
    // emit just `start`.
    const T step =
        (num != 1) ? (stop - start) / (num - 1) : static_cast<T>(0);

    auto stream = context.cuda_device_context().stream();
    const int threads_per_block = 512;
    // Ceil-divide so that every output element is covered by some thread.
    const int blocks = (num + threads_per_block - 1) / threads_per_block;
    LinspaceKernel<T><<<blocks, threads_per_block, 0, stream>>>(
        start, step, num, result_data);
  }
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macro below.
namespace ops = paddle::operators;
// CUDA kernels for `linspace` are provided for float and double outputs.
REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>,
ops::CUDALinspaceKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
// CPU kernel for `linspace`.
//
// Reads the scalar Start, Stop and Num inputs, resizes Out to `num` elements
// and fills it with evenly spaced values beginning at Start. When Num is 1,
// only Start is written.
template <typename T>
class CPULinspaceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
    T stop = context.Input<framework::Tensor>("Stop")->data<T>()[0];
    int32_t num = context.Input<framework::Tensor>("Num")->data<int32_t>()[0];
    auto* out = context.Output<framework::Tensor>("Out");

    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");

    out->Resize(framework::make_ddim({num}));
    T* out_data = out->mutable_data<T>(context.GetPlace());

    // The first entry is always exactly Start.
    out_data[0] = start;
    if (num > 1) {
      const T step = (stop - start) / (num - 1);
      // Compute every element directly from its index rather than from a
      // running sum: repeated `value += step` accumulates rounding error over
      // long sequences, and the direct form matches the CUDA kernel
      // (start + step * index).
      for (int32_t i = 1; i < num; ++i) {
        out_data[i] = start + step * i;
      }
    }
  }
};
} // namespace operators
} // namespace paddle
...@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel { ...@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = framework::OpKernelType( framework::OpKernelType kt = framework::OpKernelType(
framework::proto::VarType::FP32, platform::CPUPlace()); framework::proto::VarType::FP32, ctx.GetPlace());
return kt; return kt;
} }
}; };
......
...@@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel { ...@@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
if (!ctx->HasInput("Y")) { if (!ctx->HasInput("Y")) {
auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod"); auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
PADDLE_ENFORCE_GT(level0.size(), 1, PADDLE_ENFORCE_GT(level0.size(), 0,
"If Input(Y) not provided, the target lod should be " "If Input(Y) not provided, the target lod should be "
"specified by attribute `target_lod`."); "specified by attribute `target_lod`.");
} else { } else if (ctx->IsRuntime()) {
ctx->ShareLoD("Y", "Out"); ctx->ShareLoD("Y", "Out");
} }
...@@ -48,6 +48,23 @@ class LoDResetOp : public framework::OperatorWithKernel { ...@@ -48,6 +48,23 @@ class LoDResetOp : public framework::OperatorWithKernel {
} }
}; };
// Infers the variable type of lod_reset's output.
//
// Out is always a LOD_TENSOR with X's data type. Its LoD level is taken from
// Y when Y is supplied (but never less than 1), and is 1 otherwise.
class LoDResetOpVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(framework::InferVarTypeContext *ctx) const override {
    auto src_name = ctx->Input("X").front();
    auto dst_name = ctx->Output("Out").front();

    // Without Y the target LoD has a single level.
    const int out_lod_level =
        ctx->HasInput("Y")
            ? std::max(ctx->GetLoDLevel(ctx->Input("Y").front()), 1)
            : 1;
    ctx->SetLoDLevel(dst_name, out_lod_level);

    ctx->SetDataType(dst_name, ctx->GetDataType(src_name));
    ctx->SetType(dst_name, paddle::framework::proto::VarType::LOD_TENSOR);
  }
};
class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference, ...@@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
ops::LoDResetGradDescMaker); ops::LoDResetGradDescMaker, ops::LoDResetOpVarTypeInference);
REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp, REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
ops::LoDResetGradNoNeedBufferVarInference); ops::LoDResetGradNoNeedBufferVarInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>, lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
ops::LoDResetKernel<paddle::platform::CPUPlace, double>, ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
......
...@@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
"Target LoD should be a vector end with the " "Target LoD should be a vector end with the "
"first dimension of Input(X)."); "first dimension of Input(X).");
for (size_t i = 0; i < level0.size() - 1; ++i) { for (size_t i = 0; i < level0.size() - 1; ++i) {
PADDLE_ENFORCE(level0[i + 1] > level0[i], PADDLE_ENFORCE(level0[i + 1] >= level0[i],
"Target LoD should be an ascending vector."); "Target LoD should be an ascending vector.");
} }
......
...@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/operators/lookup_table_op.h"
#include <memory>
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
namespace paddle { namespace paddle {
...@@ -119,6 +123,29 @@ or not. And the output only shares the LoD information with input Ids. ...@@ -119,6 +123,29 @@ or not. And the output only shares the LoD information with input Ids.
} }
}; };
// Declares an inference class that lists input "W" as a no-need-buffer
// variable. It is attached to lookup_table_grad at registration.
// NOTE(review): presumably only W's metadata is read by the grad op - confirm
// against the grad kernel.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LookupTableGradOpNoBuffer, "W");
// Describes how to build the `lookup_table_grad` op from a forward
// `lookup_table` op: it takes W, Ids and Out@GRAD as inputs, produces W@GRAD,
// and inherits all attributes of the forward op.
class LookupTableGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("lookup_table_grad");

    // Forward inputs passed through unchanged.
    grad_desc->SetInput("W", Input("W"));
    grad_desc->SetInput("Ids", Input("Ids"));

    // Gradient flowing in from the forward output.
    const auto out_grad_slot = framework::GradVarName("Out");
    grad_desc->SetInput(out_grad_slot, OutputGrad("Out"));

    // Gradient produced for the embedding table.
    const auto w_grad_slot = framework::GradVarName("W");
    grad_desc->SetOutput(w_grad_slot, InputGrad("W"));

    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
class LookupTableOpGrad : public framework::OperatorWithKernel { class LookupTableOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -131,7 +158,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { ...@@ -131,7 +158,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Out")); auto data_type = framework::GetDataTypeOfVar(
ctx.InputVar(framework::GradVarName("Out")));
return framework::OpKernelType(data_type, ctx.device_context()); return framework::OpKernelType(data_type, ctx.device_context());
} }
}; };
...@@ -159,10 +187,11 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { ...@@ -159,10 +187,11 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>, ops::LookupTableGradOpDescMaker);
ops::LookupTableOpMaker);
REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad,
ops::LookupTableGradOpNoBuffer,
ops::LookupTableOpGradVarTypeInference); ops::LookupTableOpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>, REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>,
......
...@@ -75,6 +75,7 @@ std::vector<std::string> NgraphEngine::feed_vars = {}; ...@@ -75,6 +75,7 @@ std::vector<std::string> NgraphEngine::feed_vars = {};
std::vector<std::string> NgraphEngine::fetch_vars = {}; std::vector<std::string> NgraphEngine::fetch_vars = {};
framework::Variable* NgraphEngine::pre_var_ptr = nullptr; framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr; const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
bool NgraphEngine::is_training = false;
std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {}; std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
std::unordered_map<std::string, std::unordered_map<std::string,
...@@ -93,11 +94,13 @@ static std::vector<std::vector<int>> NgraphOpIntervals( ...@@ -93,11 +94,13 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
int size = ops->size(); int size = ops->size();
int left = 0; int left = 0;
while (left < size && ops->at(left)->Type() != framework::kFeedOpType && while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
ops->at(left)->Type() != "read" &&
ops->at(left)->Type() != framework::kFetchOpType) { ops->at(left)->Type() != framework::kFetchOpType) {
++left; ++left;
} }
while (left < size && ops->at(left)->Type() == framework::kFeedOpType) { while (left < size && (ops->at(left)->Type() == framework::kFeedOpType ||
ops->at(left)->Type() == "read")) {
for (auto& var_name_item : ops->at(left)->Outputs()) { for (auto& var_name_item : ops->at(left)->Outputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
NgraphEngine::feed_vars.emplace_back(var_name); NgraphEngine::feed_vars.emplace_back(var_name);
...@@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) { ...@@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
for (auto op_desc : ops_desc) { for (auto op_desc : ops_desc) {
if (op_desc->Type().find("_grad") != std::string::npos) { if (op_desc->Type().find("_grad") != std::string::npos) {
is_training = true;
this->is_test_ = false; this->is_test_ = false;
break; break;
} }
...@@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope, ...@@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope,
} }
bool is_persistable = bool is_persistable =
(p_persistables->find(vi) != p_persistables->end()) ? true : false; (p_persistables->find(vi) != p_persistables->end()) ? true : false;
if (is_test && is_persistable) { if (!is_training && is_test && is_persistable) {
ti->set_stale(false); ti->set_stale(false);
} }
(*p_t_in).emplace_back(ti); (*p_t_in).emplace_back(ti);
......
...@@ -57,6 +57,7 @@ class NgraphEngine { ...@@ -57,6 +57,7 @@ class NgraphEngine {
void Run(const framework::Scope& scope, const platform::Place& place) const; void Run(const framework::Scope& scope, const platform::Place& place) const;
static bool is_training;
static const framework::BlockDesc* p_bdesc; static const framework::BlockDesc* p_bdesc;
static std::vector<std::string> feed_vars, fetch_vars; static std::vector<std::string> feed_vars, fetch_vars;
......
/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pixel_shuffle_op.h"
#include <memory>
namespace paddle {
namespace operators {
// Operator definition for `pixel_shuffle`.
//
// Expects a 4-D input X and an integer attribute `upscale_factor` (r). The
// output shape is derived from X's as
//   [d0, d1 / r^2, d2 * r, d3 * r].
class PixelShuffleOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates the input/output wiring and the channel divisibility
  // requirement, then sets the shape of Out.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of PixelShuffleOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of PixelShuffleOp should not be null.");

    auto input_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");

    auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
    // Spelling of the message fixed ("devide" -> "divide").
    PADDLE_ENFORCE(input_dims[1] % (upscale_factor * upscale_factor) == 0,
                   "Upscale_factor should divide the number of channel");

    // Start from the input shape; dim 0 is carried over unchanged.
    auto output_dims = input_dims;
    output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
    output_dims[2] = input_dims[2] * upscale_factor;
    output_dims[3] = input_dims[3] * upscale_factor;
    ctx->SetOutputDim("Out", output_dims);
  }
};
// Declares the proto of the `pixel_shuffle` operator: one input X, one output
// Out and the integer attribute `upscale_factor` (default 1, checked to be
// at least 1).
class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput(
        "X",
        "(Tensor, default Tensor<float>), "
        "the input feature data of PixelShuffleOp, the layout is [N C H W].");
    AddOutput(
        "Out",
        "(Tensor, default Tensor<float>), the output of "
        "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor].");
    AddAttr<int>("upscale_factor",
                 "the factor to increase spatial resolution by.")
        .SetDefault(1)
        .AddCustomChecker([](const int& upscale_factor) {
          PADDLE_ENFORCE_GE(upscale_factor, 1,
                            "upscale_factor should be larger than 0.");
        });

    // The output shape in the description below previously omitted the
    // leading batch dimensions; it now agrees with the layout documented for
    // Out above.
    AddComment(R"DOC(
Pixel Shuffle operator
This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)`
to a tensor of shape :math:`(*, C, H \times r, W \times r)`.
This is useful for implementing efficient sub-pixel convolution
with a stride of :math:`1/r`.
Please refer to the paper:
`Real-Time Single Image and Video Super-Resolution Using an Efficient
Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
by Shi et. al (2016) for more details.
)DOC");
  }
};
class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("pixel_shuffle_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
// Operator definition for `pixel_shuffle_grad`.
//
// Given a 4-D Out@GRAD and the `upscale_factor` attribute (r), the shape of
// X@GRAD is computed as the inverse of the forward shape rule:
//   [d0, d1 * r^2, d2 / r, d3 / r].
class PixelShuffleGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@Grad) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@Grad) should not be null");

    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
    PADDLE_ENFORCE(out_grad_dims.size() == 4, "The layout of input is NCHW.");

    const int r = ctx->Attrs().Get<int>("upscale_factor");

    // Copy the incoming shape and adjust dims 1..3 in place; dim 0 is kept.
    auto x_grad_dims = out_grad_dims;
    x_grad_dims[1] *= r * r;
    x_grad_dims[2] /= r;
    x_grad_dims[3] /= r;
    ctx->SetOutputDim(framework::GradVarName("X"), x_grad_dims);
  }
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Forward op: op class, proto maker and the maker that builds its grad op.
REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker,
ops::PixelShuffleGradMaker);
// Backward op: only the op class is registered.
REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp);
// CPU kernels for the forward op, for float and double.
REGISTER_OP_CPU_KERNEL(
pixel_shuffle,
ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, double>);
// CPU kernels for the backward op, for float and double.
REGISTER_OP_CPU_KERNEL(
pixel_shuffle_grad,
ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pixel_shuffle_op.h"
// Short aliases used by the registration macros below.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// CUDA kernels for the forward op, for float and double. The same kernel
// templates as on CPU are instantiated with CUDADeviceContext.
REGISTER_OP_CUDA_KERNEL(
pixel_shuffle, ops::PixelShuffleOpKernel<plat::CUDADeviceContext, float>,
ops::PixelShuffleOpKernel<plat::CUDADeviceContext, double>);
// CUDA kernels for the backward op, for float and double.
REGISTER_OP_CUDA_KERNEL(
pixel_shuffle_grad,
ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, float>,
ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
// Forward kernel of `pixel_shuffle`, implemented as a single 6-D transpose.
//
// With r = upscale_factor, X is viewed as [d0, C_out, r, r, d2, d3] and Out as
// [d0, C_out, d2, r, d3, r] (C_out is Out's dim 1). Both views share storage
// with the real tensors, so transposing the first view into the second writes
// the result straight into Out.
template <typename DeviceContext, typename T>
class PixelShuffleOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::Tensor>("X");
    auto* y = ctx.Output<framework::Tensor>("Out");
    y->mutable_data<T>(ctx.GetPlace());

    const int r = ctx.Attr<int>("upscale_factor");
    const auto x_dims = x->dims();
    const auto y_dims = y->dims();

    // 6-D view over the input buffer.
    framework::Tensor src_view;
    src_view.ShareDataWith(*x);
    src_view.Resize({x_dims[0], y_dims[1], r, r, x_dims[2], x_dims[3]});

    // 6-D view over the output buffer.
    framework::Tensor dst_view;
    dst_view.ShareDataWith(*y);
    dst_view.Resize({x_dims[0], y_dims[1], x_dims[2], r, x_dims[3], r});

    // Interleave each r-sized axis after the spatial axis it expands.
    const std::vector<int> perm = {0, 1, 4, 2, 5, 3};
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::Transpose<DeviceContext, T, 6> transpose;
    transpose(dev_ctx, src_view, &dst_view, perm);

    y->Resize(y_dims);
  }
};
// Backward kernel of `pixel_shuffle`: applies the inverse 6-D transpose of the
// forward kernel.
//
// With r = upscale_factor, Out@GRAD is viewed as [d0, C, h, r, w, r] and
// X@GRAD as [d0, C, r, r, h, w], where C is Out@GRAD's dim 1 and h, w are
// X@GRAD's dims 2 and 3. Both views share storage with the real tensors.
template <typename DeviceContext, typename T>
class PixelShuffleGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out_grad = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    x_grad->mutable_data<T>(ctx.GetPlace());

    const int r = ctx.Attr<int>("upscale_factor");
    const auto og_dims = out_grad->dims();
    const auto xg_dims = x_grad->dims();

    // 6-D view over the incoming gradient.
    framework::Tensor src_view;
    src_view.ShareDataWith(*out_grad);
    src_view.Resize({og_dims[0], og_dims[1], xg_dims[2], r, xg_dims[3], r});

    // 6-D view over the gradient being produced.
    framework::Tensor dst_view;
    dst_view.ShareDataWith(*x_grad);
    dst_view.Resize({og_dims[0], og_dims[1], r, r, xg_dims[2], xg_dims[3]});

    // Gather the two r-sized axes back in front of the spatial axes.
    const std::vector<int> perm = {0, 1, 3, 5, 2, 4};
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::Transpose<DeviceContext, T, 6> transpose;
    transpose(dev_ctx, src_view, &dst_view, perm);

    x_grad->Resize(xg_dims);
  }
};
} // namespace operators
} // namespace paddle
...@@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states"; ...@@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states";
constexpr char kParameters[] = "parameters"; constexpr char kParameters[] = "parameters";
constexpr char kOutputs[] = "outputs"; constexpr char kOutputs[] = "outputs";
constexpr char kStepScopes[] = "step_scopes"; constexpr char kStepScopes[] = "step_scopes";
constexpr char kHasStates[] = "has_states";
constexpr char kExStates[] = "ex_states"; constexpr char kExStates[] = "ex_states";
constexpr char kStates[] = "states"; constexpr char kStates[] = "states";
constexpr char kStepBlock[] = "sub_block"; constexpr char kStepBlock[] = "sub_block";
...@@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase { ...@@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
bool has_state = Attr<bool>(kHasStates);
auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope)); auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
VLOG(3) << "Static RNN input sequence length = " << seq_len; VLOG(3) << "Static RNN input sequence length = " << seq_len;
StepScopes scopes = CreateStepScopes(scope, seq_len); StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::Executor executor(place); framework::Executor executor(place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
...@@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase { ...@@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase {
inside->Resize(framework::make_ddim(dims)); inside->Resize(framework::make_ddim(dims));
}); });
if (i == 0) { if (has_state) {
// Link initial states --> ex_states if (i == 0) {
LinkTensor(scope, Inputs(kInitialStates), &cur_scope, // Link initial states --> ex_states
Attr<std::vector<std::string>>(kExStates)); LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
} else { Attr<std::vector<std::string>>(kExStates));
auto &ex_scope = scopes.ExScope(); } else {
// Link ex_scope::state --> cur_scope::ex_state auto &ex_scope = scopes.ExScope();
LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates), // Link ex_scope::state --> cur_scope::ex_state
&cur_scope, Attr<std::vector<std::string>>(kExStates)); LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
&cur_scope, Attr<std::vector<std::string>>(kExStates));
}
} }
// Every inputs are linked now, execute! // Every inputs are linked now, execute!
...@@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase { ...@@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase {
std::vector<std::string>() /*skip_ref_cnt_vars*/, std::vector<std::string>() /*skip_ref_cnt_vars*/,
true /*force_disable_gc*/); true /*force_disable_gc*/);
// get device context from pool
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// Copy inside::output -> outside::output // Copy inside::output -> outside::output
// outside::output[seq_offset: seq_offset + 1] = inside::output // outside::output[seq_offset: seq_offset + 1] = inside::output
this->LinkTensorWithCallback( this->LinkTensorWithCallback(
...@@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
auto seq_len = static_cast<size_t>(GetSequenceLength(scope)); bool has_state = Attr<bool>(kHasStates);
const size_t seq_len = static_cast<size_t>(GetSequenceLength(scope));
StepScopes scopes = CreateStepScopes(scope, seq_len); StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
framework::Executor executor(place); framework::Executor executor(place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
// get device context from pool // get device context from pool
...@@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase {
size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
auto &cur_scope = scopes.CurScope(); auto &cur_scope = scopes.CurScope();
// Link outside::output_grads --> inside::output_grads // Link outside::output_grads --> inside::output_grads
// inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
LinkTensorWithCallback( LinkTensorWithCallback(
...@@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase {
VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
} }
// Link states if (has_state) {
// if cur_scope::cur_state_grad in out_grads: // Link states
// cur_scope::cur_state_grad += ex_scope::ex_state_grad // if cur_scope::cur_state_grad in out_grads:
// else: // cur_scope::cur_state_grad += ex_scope::ex_state_grad
// ex_scope::ex_state_grad --> cur_scope::cur_state_grad // else:
if (step_id != 0) { // not at beginning // ex_scope::ex_state_grad --> cur_scope::cur_state_grad
auto &ex_scope = scopes.ExScope(); if (step_id != 0) { // not at beginning
auto ex_state_grads = auto &ex_scope = scopes.ExScope();
GradVarLists(Attr<std::vector<std::string>>(kExStates)); auto ex_state_grads =
auto cur_state_grads = GradVarLists(Attr<std::vector<std::string>>(kExStates));
GradVarLists(Attr<std::vector<std::string>>(kStates)); auto cur_state_grads =
GradVarLists(Attr<std::vector<std::string>>(kStates));
PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
for (size_t i = 0; i < ex_state_grads.size(); ++i) { PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
auto &cur_grad = cur_state_grads[i]; for (size_t i = 0; i < ex_state_grads.size(); ++i) {
auto &ex_grad = ex_state_grads[i]; auto &cur_grad = cur_state_grads[i];
auto &ex_tensor = auto &ex_grad = ex_state_grads[i];
ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>(); auto &ex_tensor =
ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
auto *cur_grad_var = cur_scope.Var(cur_grad); VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
auto cur_grad_tensor = auto *cur_grad_var = cur_scope.Var(cur_grad);
cur_grad_var->GetMutable<framework::LoDTensor>(); auto cur_grad_tensor =
framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor); cur_grad_var->GetMutable<framework::LoDTensor>();
framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
}
} }
} }
...@@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase {
} }
auto new_inside_name = cur_scope.Rename(inside_grad_name); auto new_inside_name = cur_scope.Rename(inside_grad_name);
// sum gradient
// sum gradient
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}}, {{"Out", {pg_names[param_id]}}},
...@@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase {
true /*is_backward*/); true /*is_backward*/);
VLOG(5) << "Link outside gradient finished "; VLOG(5) << "Link outside gradient finished ";
if (step_id + 1 == seq_len) { // at_end if (has_state) {
// copy initialize states gradient from inside to outside if (step_id + 1 == seq_len) { // at_end
LinkTensorWithCallback( // copy initialize states gradient from inside to outside
cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)), LinkTensorWithCallback(
scope, Outputs(kInitStateGrads), cur_scope,
[&](const framework::LoDTensor &inside, GradVarLists(Attr<std::vector<std::string>>(kExStates)), scope,
framework::LoDTensor *outside) { Outputs(kInitStateGrads),
outside->Resize(inside.dims()); [&](const framework::LoDTensor &inside,
outside->mutable_data(place, inside.type()); framework::LoDTensor *outside) {
framework::TensorCopy(inside, place, dev_ctx, outside); outside->Resize(inside.dims());
}, outside->mutable_data(place, inside.type());
true /*is_backward*/); framework::TensorCopy(inside, place, dev_ctx, outside);
VLOG(5) << "Link initialize state gradient finished "; },
true /*is_backward*/);
VLOG(5) << "Link initialize state gradient finished ";
}
} }
scopes.Next(); scopes.Next();
} }
// Delete the scope of StepScopes
dev_ctx.Wait();
auto *var = scope.FindVar(Input(kStepScopes));
PADDLE_ENFORCE(var != nullptr);
auto step_scopes = var->GetMutable<StepScopeVar>();
for (auto *sub_scope : *step_scopes) {
const_cast<framework::Scope &>(scope).DeleteScope(sub_scope);
}
} }
private: private:
...@@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
.AsDuplicable(); .AsDuplicable();
AddOutput(kStepScopes, AddOutput(kStepScopes,
"StepScopes contain all local variables in each time step."); "StepScopes contain all local variables in each time step.");
AddAttr<bool>(kHasStates, "Whether has states.").SetDefault(false);
AddAttr<std::vector<std::string>>(kExStates, AddAttr<std::vector<std::string>>(kExStates,
string::Sprintf( string::Sprintf(
R"DOC(The ex-state variable names. R"DOC(The ex-state variable names.
...@@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
class RecurrentGradOpShapeInference : public framework::InferShapeBase { class RecurrentGradOpShapeInference : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { void operator()(framework::InferShapeContext *ctx) const override {
std::vector<std::string> input{kInputs, kInitialStates};
std::vector<std::string> output{kOutputs}; std::vector<std::string> output{kOutputs};
for (auto &s : input) {
// NOTE(zcd): In some case, some of kInputs doesn't have gradient. // In some case the kInitialStates is empty.
PADDLE_ENFORCE(ctx->HasInputs(s)); // If the kInitialStates is empty, all the states should be empty.
} if (!ctx->HasInputs(kInitialStates)) {
for (auto &s : output) { PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(ctx->HasInputs(s)); ctx->Attrs().Get<std::vector<std::string>>(kExStates).size(), 0,
"The Attr(%s) should be empty.", kExStates);
PADDLE_ENFORCE_EQ(
ctx->Attrs().Get<std::vector<std::string>>(kStates).size(), 0,
"The Attr(%s) should be empty.", kStates);
} }
for (auto &s : input) {
ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); PADDLE_ENFORCE(ctx->HasInputs(kInputs),
"The input(%s) should not be empty.", kInputs);
PADDLE_ENFORCE(ctx->HasInputs(kOutputs),
"The input(%s) should not be empty.", kOutputs);
// In some case the kInitialStates is empty.
if (ctx->HasInputs(kInitialStates)) {
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInitialStates)),
"The output of(%s) should not be empty.",
framework::GradVarName(kInitialStates));
ctx->SetOutputsDim(framework::GradVarName(kInitialStates),
ctx->GetInputsDim(kInitialStates));
} }
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInputs)),
"The output of(%s) should not be empty.",
framework::GradVarName(kInputs));
ctx->SetOutputsDim(framework::GradVarName(kInputs),
ctx->GetInputsDim(kInputs));
// In some case the kParameters is empty.
if (ctx->HasInputs(kParameters)) { if (ctx->HasInputs(kParameters)) {
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)),
"The output of(%s) should not be empty.",
framework::GradVarName(kParameters));
ctx->SetOutputsDim(framework::GradVarName(kParameters), ctx->SetOutputsDim(framework::GradVarName(kParameters),
ctx->GetInputsDim(kParameters)); ctx->GetInputsDim(kParameters));
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
// Register the reduce_all operator through the shared REGISTER_REDUCE_OP
// macro.
// NOTE(review): the macro and the `ops` namespace alias are not defined in
// this file; presumably both come from the reduce_op.h include chain --
// confirm.
REGISTER_REDUCE_OP(reduce_all);
// CPU kernel: ReduceKernel instantiated for CPUDeviceContext with bool
// elements, using AllFunctor as the reduction functor.
REGISTER_OP_CPU_KERNEL(reduce_all,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
bool, ops::AllFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
// CUDA kernel for reduce_all: ReduceKernel instantiated for
// CUDADeviceContext with bool elements, using AllFunctor as the reduction
// functor.
// NOTE(review): the `ops` namespace alias is not declared in this file;
// presumably it is provided by the included headers -- confirm.
REGISTER_OP_CUDA_KERNEL(reduce_all,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
bool, ops::AllFunctor>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle {
namespace operators {
// Reduction functor passed to ReduceKernel for the reduce_all operator.
// operator() assigns to *y the result of x->all(dim), evaluated on the
// device object it receives.
// NOTE(review): x and y look like Eigen tensor expressions (device() and
// all() are Eigen Tensor API names) -- confirm against reduce_op.h.
struct AllFunctor {
// place: device object forwarded to y->device(); its type is the
//        DeviceContext template parameter.
// x:     input expression to be reduced.
// y:     output expression that receives the reduction result.
// dim:   reduction dimensions, forwarded unchanged to x->all().
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->all(dim);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
// Register the reduce_any operator through the shared REGISTER_REDUCE_OP
// macro.
// NOTE(review): the macro and the `ops` namespace alias are not defined in
// this file; presumably both come from the reduce_op.h include chain --
// confirm.
REGISTER_REDUCE_OP(reduce_any);
// CPU kernel: ReduceKernel instantiated for CPUDeviceContext with bool
// elements, using AnyFunctor as the reduction functor.
REGISTER_OP_CPU_KERNEL(reduce_any,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
bool, ops::AnyFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
// CUDA kernel for reduce_any: ReduceKernel instantiated for
// CUDADeviceContext with bool elements, using AnyFunctor as the reduction
// functor.
// NOTE(review): the `ops` namespace alias is not declared in this file;
// presumably it is provided by the included headers -- confirm.
REGISTER_OP_CUDA_KERNEL(reduce_any,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
bool, ops::AnyFunctor>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle {
namespace operators {
// Reduction functor passed to ReduceKernel for the reduce_any operator.
// operator() assigns to *y the result of x->any(dim), evaluated on the
// device object it receives.
// NOTE(review): x and y look like Eigen tensor expressions (device() and
// any() are Eigen Tensor API names) -- confirm against reduce_op.h.
struct AnyFunctor {
// place: device object forwarded to y->device(); its type is the
//        DeviceContext template parameter.
// x:     input expression to be reduced.
// y:     output expression that receives the reduction result.
// dim:   reduction dimensions, forwarded unchanged to x->any().
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->any(dim);
}
};
} // namespace operators
} // namespace paddle
...@@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase { ...@@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
"Cannot find out_var in scope, out_var_name is %s", "Cannot find out_var in scope, out_var_name is %s",
out_name); out_name);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>(); auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
auto &mem_tensor = mem_var->Get<framework::LoDTensor>(); auto &mem_tensor = mem_var->Get<framework::LoDTensor>();
framework::TensorCopySync(mem_tensor, dev_place, out_tensor); framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor);
out_tensor->set_lod(mem_tensor.lod()); out_tensor->set_lod(mem_tensor.lod());
} }
}; };
...@@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { ...@@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
"Cannot find in_grad_var in scope, name is %s", "Cannot find in_grad_var in scope, name is %s",
in_grad_var_name); in_grad_var_name);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
if (out_grad_var == nullptr) { if (out_grad_var == nullptr) {
VLOG(5) << "Using fill constant 0 as starting gradient"; VLOG(5) << "Using fill constant 0 as starting gradient";
auto in_var_name = Input("X"); auto in_var_name = Input("X");
...@@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { ...@@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
} else { } else {
auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>(); auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>(); auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor); framework::TensorCopy(out_grad_tensor, dev_place, dev_ctx,
in_grad_tensor);
in_grad_tensor->set_lod(out_grad_tensor.lod()); in_grad_tensor->set_lod(out_grad_tensor.lod());
} }
} }
......
...@@ -13,12 +13,18 @@ See the License for the specific language governing permissions and ...@@ -13,12 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/space_to_depth_op.h" #include "paddle/fluid/operators/space_to_depth_op.h"
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class SpaceToDepthOp : public framework::OperatorWithKernel { class SpaceToDepthOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -100,6 +106,28 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -100,6 +106,28 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
} }
}; };
// Declares SpaceToDepthGradOpNoBuffer, which is attached to the
// space_to_depth_grad operator at registration.
// NOTE(review): going by the macro name, this marks input "X" as a variable
// whose data buffer the grad op does not need -- confirm against
// no_need_buffer_vars_inference.h.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SpaceToDepthGradOpNoBuffer, "X");
// Builds the OpDesc of the backward operator ("space_to_depth_grad") for a
// forward space_to_depth operator.
//
// The generated grad op is wired as follows:
//   inputs : Out@GRAD (gradient of the forward output) and X (forward input)
//   outputs: X@GRAD   (gradient of the forward input)
//   attrs  : copied unchanged from the forward operator
class SpaceToDepthGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  // Creates and returns the description of the single grad op.
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("space_to_depth_grad");

    // Gradient-slot names derived from the forward slot names.
    const auto out_grad_slot = framework::GradVarName("Out");
    const auto x_grad_slot = framework::GradVarName("X");

    grad_desc->SetInput(out_grad_slot, OutputGrad("Out"));
    grad_desc->SetInput("X", Input("X"));
    grad_desc->SetOutput(x_grad_slot, InputGrad("X"));

    // The backward op reuses every attribute of the forward op.
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
class SpaceToDepthGradOp : public framework::OperatorWithKernel { class SpaceToDepthGradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -110,6 +138,14 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel { ...@@ -110,6 +138,14 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel {
"Input(Out@GRAD) shouldn't be null."); "Input(Out@GRAD) shouldn't be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
} }
protected:
// Selects the kernel from the data type of the Out@GRAD input tensor and
// the place the op is executed on.
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  const auto* out_grad_tensor =
      ctx.Input<Tensor>(framework::GradVarName("Out"));
  return framework::OpKernelType(out_grad_tensor->type(), ctx.GetPlace());
}
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -117,8 +153,9 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel { ...@@ -117,8 +153,9 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(space_to_depth, ops::SpaceToDepthOp, ops::SpaceToDepthOpMaker, REGISTER_OPERATOR(space_to_depth, ops::SpaceToDepthOp, ops::SpaceToDepthOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::SpaceToDepthGradOpDescMaker);
REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp); REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp,
ops::SpaceToDepthGradOpNoBuffer);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
space_to_depth, space_to_depth,
ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, float>, ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/squared_l2_norm_op.h" #include "paddle/fluid/operators/squared_l2_norm_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -31,6 +33,26 @@ class SquaredL2NormOp : public framework::OperatorWithKernel { ...@@ -31,6 +33,26 @@ class SquaredL2NormOp : public framework::OperatorWithKernel {
} }
}; };
// Builds the OpDesc of the backward operator ("squared_l2_norm_grad") for a
// forward squared_l2_norm operator.
//
// The generated grad op is wired as follows:
//   inputs : Out@GRAD (gradient of the forward output) and X (forward input)
//   outputs: X@GRAD   (gradient of the forward input)
//   attrs  : copied unchanged from the forward operator
class SquaredL2NormGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  // Creates and returns the description of the single grad op.
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("squared_l2_norm_grad");

    // Gradient-slot names derived from the forward slot names.
    const auto out_grad_slot = framework::GradVarName("Out");
    const auto x_grad_slot = framework::GradVarName("X");

    grad_desc->SetInput(out_grad_slot, OutputGrad("Out"));
    grad_desc->SetInput("X", Input("X"));
    grad_desc->SetOutput(x_grad_slot, InputGrad("X"));

    // The backward op reuses every attribute of the forward op.
    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
class SquaredL2NormGradOp : public framework::OperatorWithKernel { class SquaredL2NormGradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -67,8 +89,7 @@ $$Out = \sum_{i} X_{i}^2$$ ...@@ -67,8 +89,7 @@ $$Out = \sum_{i} X_{i}^2$$
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp, REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp,
ops::SquaredL2NormOpMaker, ops::SquaredL2NormOpMaker, ops::SquaredL2NormGradOpDescMaker);
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp); REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
squared_l2_norm, squared_l2_norm,
......
...@@ -17,9 +17,6 @@ if (CUPTI_FOUND) ...@@ -17,9 +17,6 @@ if (CUPTI_FOUND)
endif(CUPTI_FOUND) endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
if (WITH_WBAES)
cc_library(dynload_wbaes SRCS wbaes.cc DEPS dynamic_loader wbaes)
endif()
if (WITH_MKLML) if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif() endif()
......
...@@ -32,7 +32,6 @@ void* GetWarpCTCDsoHandle(); ...@@ -32,7 +32,6 @@ void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle(); void* GetNCCLDsoHandle();
void* GetTensorRtDsoHandle(); void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle(); void* GetMKLMLDsoHandle();
void* GetWBAESDsoHandle();
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册