提交 d6b5302b 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_op_tensor_support

......@@ -73,6 +73,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER
if(NOT PY_VERSION)
set(PY_VERSION 2.7)
endif()
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -159,6 +160,7 @@ endif()
########################################################################################
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
......@@ -175,6 +177,7 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
include(external/cub)
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
......@@ -201,12 +204,11 @@ include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif()
include(cudnn) # set cudnn libraries, must before configure
......
......@@ -97,6 +97,14 @@ if(WITH_GPU)
endif()
include_directories(${TENSORRT_INCLUDE_DIR})
endif()
if(WITH_ANAKIN)
  # Anakin requires CUDA >= 8 and cuDNN >= 7; abort configuration otherwise.
  # The version variables are quoted so that an unset or empty value cannot
  # collapse the condition into a malformed if() expression.
  if("${CUDA_VERSION_MAJOR}" VERSION_LESS 8)
    message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
  endif()
  if("${CUDNN_MAJOR_VERSION}" VERSION_LESS 7)
    message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
  endif()
endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
......
......@@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN)
return()
endif()
set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
"Anakin install path." FORCE)
set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
INCLUDE(ExternalProject)
set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin)
# the anakin install dir is only default one now
set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output)
set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR})
set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later.
set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}")
include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/)
set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
......@@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp)
set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz")
# A helper function used in Anakin, currently, to use it, one need to recursively include
# nearly all the header files.
function(fetch_include_recursively root_dir)
if (IS_DIRECTORY ${root_dir})
include_directories(${root_dir})
endif()
file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
foreach(sub ${ALL_SUB})
if (IS_DIRECTORY ${root_dir}/${sub})
fetch_include_recursively(${root_dir}/${sub})
endif()
endforeach()
endfunction()
if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
endif()
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(luotao): use PaddlePaddle/Anakin later
GIT_REPOSITORY "https://github.com/luotao1/Anakin"
GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
-DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DCUDNN_ROOT=${CUDNN_ROOT}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
)
if (WITH_ANAKIN)
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
fetch_include_recursively(${ANAKIN_INCLUDE})
link_directories(${ANAKIN_LIBRARY})
endif()
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
add_library(anakin_shared SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
add_dependencies(anakin_shared extern_anakin protobuf mklml)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin protobuf mklml)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
# Fetches the CUB sources and exposes them through the `cub` target.
# Nothing is done for builds without GPU support.
if(NOT WITH_GPU)
  return()
endif()

include(ExternalProject)

set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
# ExternalProject places the checkout in <PREFIX>/src/<project name>.
set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
include_directories(${CUB_INCLUDE_DIR})

# Only the checkout is needed: the configure, build, install and test steps
# are all disabled, and the source directory itself is the include path.
ExternalProject_Add(
  extern_cub
  ${EXTERNAL_PROJECT_LOG_ARGS}
  GIT_REPOSITORY    "https://github.com/NVlabs/cub.git"
  GIT_TAG           "v1.8.0"
  PREFIX            ${CUB_SOURCE_DIR}
  UPDATE_COMMAND    ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)

# `cub` is a placeholder target that other targets can depend on.
# CMake older than 3.3 gets a static library built from a generated dummy
# source instead of an INTERFACE library.
# NOTE(review): presumably because add_dependencies() on INTERFACE libraries
# needs CMake >= 3.3 - confirm.
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
  add_library(cub STATIC ${dummyfile})
else()
  add_library(cub INTERFACE)
endif()

add_dependencies(cub extern_cub)

# Register with the shared dependency list. The list name was previously
# misspelled as `externl_project_dependencies`, so `cub` was never recorded.
list(APPEND external_project_dependencies cub)
......@@ -24,7 +24,7 @@ SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
IF(WIN32 OR APPLE)
MESSAGE(WARNING
MESSAGE(WARNING
"Windows or Mac is not supported with MKLDNN in Paddle yet."
"Force WITH_MKLDNN=OFF")
SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
......@@ -57,8 +57,10 @@ ExternalProject_Add(
GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
......
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Xbyak is disabled on Windows and macOS: record that in the cache and stop.
# This guard runs before the plain WITH_XBYAK variable is set, so no normal
# variable can shadow the cached OFF value (relevant under policy CMP0126).
if(WIN32 OR APPLE)
  set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
  return()
endif()

set(WITH_XBYAK ON)

include(ExternalProject)

set(XBYAK_PROJECT       extern_xbyak)
set(XBYAK_PREFIX_DIR    ${THIRD_PARTY_PATH}/xbyak)
set(XBYAK_INSTALL_ROOT  ${THIRD_PARTY_PATH}/install/xbyak)
set(XBYAK_INC_DIR       ${XBYAK_INSTALL_ROOT}/include)

# Both <include> and <include>/xbyak are put on the include path.
include_directories(${XBYAK_INC_DIR})
include_directories(${XBYAK_INC_DIR}/xbyak)

add_definitions(-DPADDLE_WITH_XBYAK)

# xbyak options
add_definitions(-DXBYAK64)
add_definitions(-DXBYAK_NO_OP_NAMES)

# Download xbyak at a pinned tag and install it under XBYAK_INSTALL_ROOT.
ExternalProject_Add(
  ${XBYAK_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  DEPENDS           ""
  GIT_REPOSITORY    "https://github.com/herumi/xbyak.git"
  GIT_TAG           "v5.661"  # Jul 26th
  PREFIX            ${XBYAK_PREFIX_DIR}
  UPDATE_COMMAND    ""
  CMAKE_ARGS        -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
  CMAKE_CACHE_ARGS  -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
)

# `xbyak` is a placeholder target that other targets can depend on.
# CMake older than 3.3 gets a static library built from a generated dummy
# source instead of an INTERFACE library.
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
  add_library(xbyak STATIC ${dummyfile})
else()
  add_library(xbyak INTERFACE)
endif()

add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)
......@@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
list(APPEND inference_deps anakin_inference_lib)
endif()
......
......@@ -38,11 +38,3 @@ _switch_scope
.. autofunction:: paddle.fluid.executor._switch_scope
:noindex:
.. _api_fluid_executor_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.executor.fetch_var
:noindex:
......@@ -106,22 +106,6 @@ _switch_scope
.. autofunction:: paddle.fluid._switch_scope
:noindex:
.. _api_fluid_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.fetch_var
:noindex:
.. _api_fluid_Go:
Go
--
.. autoclass:: paddle.fluid.Go
:members:
:noindex:
.. _api_fluid_make_channel:
......
......@@ -177,8 +177,8 @@ graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah));
auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
mem_opt_pass.SetNotOwned<int>("optimize_level", 1);
mem_opt_pass->Apply(std::move(graph));
graph = PassRegistry::Instance().Get("multi_device_pass").Apply(std::move(grah));
graph = PassRegistry::Instance().Get("multi_device_check_pass").Apply(std::move(grah));
graph = PassRegistry::Instance().Get("multi_devices_pass").Apply(std::move(graph));
graph = PassRegistry::Instance().Get("multi_devices_check_pass").Apply(std::move(graph));
Executor exe;
exe.Run(graph);
......
......@@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N
paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
......@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
......@@ -34,21 +37,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None,
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
......@@ -62,20 +54,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', '
paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False))
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
......@@ -89,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro
paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
......@@ -338,14 +326,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -100,7 +100,7 @@ else()
endif()
cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
......@@ -5,9 +5,9 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph graph_helper)
cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
......@@ -28,7 +28,7 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
......@@ -86,7 +86,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
} // namespace framework
} // namespace paddle
REGISTER_PASS(multi_device_check_pass,
REGISTER_PASS(multi_devices_check_pass,
paddle::framework::details::SSAGraghBuilderWithChecker)
.RequireGraphAttr(paddle::framework::details::kGraphVars)
.RequireGraphAttr(paddle::framework::details::kGraphDepVars)
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include <string>
......@@ -22,7 +22,7 @@ namespace paddle {
namespace framework {
namespace details {
class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
class SSAGraghBuilderWithChecker : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
......
......@@ -21,7 +21,7 @@
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
......@@ -33,6 +33,92 @@
namespace paddle {
namespace framework {
namespace details {
namespace {
// Adds explicit control dependencies between readers of one version of a
// variable and the op that generates the next version.
//
// For every variable name that has more than one version, adjacent versions
// are visited from newest to oldest. Each op pending on the older version
// (a reader) is linked to the op that generated the newer version (the
// writer) through a fresh DummyVarHandle, unless the reader is the writer
// itself or one of the reader's outputs is already an input of the writer.
// Every dummy var created here is also recorded under kGraphDepVars.
void PolishGraphToSupportDataHazards(ir::Graph *graph) {
  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
    for (auto &name_pair : var_map) {
      // A single version cannot produce a hazard.
      if (name_pair.second.size() <= 1) {
        continue;
      }
      // it_new / it_old always point at two adjacent versions.
      auto it_new = name_pair.second.rbegin();
      auto it_old = name_pair.second.rbegin();
      ++it_old;
      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
        OpHandleBase *write_op = (*it_new)->GeneratedOp();
        const auto &read_ops = (*it_old)->PendingOps();

        for (auto *read_op : read_ops) {
          // Manually add a dependency var from read_op to write_op;
          if (read_op == write_op) {
            // Read Write is the same op.
            continue;
          }

          // Look for an existing edge read_op -> write_op.
          bool has_dep = false;
          for (auto *r_out : read_op->Outputs()) {
            for (auto *w_in : write_op->Inputs()) {
              if (r_out->Node() == w_in->Node()) {
                has_dep = true;
                break;
              }
            }
            // Stop scanning the remaining outputs once an edge is found.
            if (has_dep) {
              break;
            }
          }
          if (has_dep) continue;

          auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
          read_op->AddOutput(dep_var);
          write_op->AddInput(dep_var);
          graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
        }
      }
    }
  }
}
// Returns the most recent VarHandle recorded for node->Name() at
// place_offset. When no version exists yet, a version-0 handle is created,
// stored in the graph's kGraphVars attribute and returned. The new handle
// wraps a var node built from node->Var() when that is non-null, otherwise an
// empty variable node carrying the same name.
VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
                                      const platform::Place &place,
                                      size_t place_offset) {
  auto &versions =
      graph->Get<GraphVars>(kGraphVars)[place_offset][node->Name()];

  // Fast path: a version already exists, hand back the newest one.
  if (!versions.empty()) {
    return versions.back().get();
  }

  ir::Node *var_node =
      node->Var()
          ? graph->CreateVarNode(node->Var())
          : graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable);

  auto *handle = new VarHandle(var_node, 0, place_offset, node->Name(), place);
  versions.emplace_back(handle);
  return handle;
}
// Appends a new version of the variable named new_node->Name() at
// place_offset and registers it as an output of op_handle. The version number
// is the count of versions already recorded for that name.
void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
                    ir::Node *new_node, const platform::Place &place,
                    size_t place_offset) {
  auto &versions =
      graph->Get<GraphVars>(kGraphVars)[place_offset][new_node->Name()];
  const size_t next_version = versions.size();

  auto *handle = new VarHandle(new_node, next_version, place_offset,
                               new_node->Name(), place);
  versions.emplace_back(handle);
  op_handle->AddOutput(handle);
}
// Gives every op that currently has no outputs a dummy control-dependency
// output; the dummy var is also recorded under kGraphDepVars.
void AddOutputToLeafOps(ir::Graph *graph) {
  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
    if (op->Outputs().empty()) {
      auto *leaf_marker = new DummyVarHandle(graph->CreateControlDepVar());
      graph->Get<GraphDepVars>(kGraphDepVars).emplace(leaf_marker);
      op->AddOutput(leaf_marker);
    }
  }
}
} // namespace
static const char kLossVarName[] = "loss_var_name";
static const char kPlaces[] = "places";
......@@ -751,7 +837,7 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
} // namespace framework
} // namespace paddle
REGISTER_PASS(multi_device_pass,
REGISTER_PASS(multi_devices_pass,
paddle::framework::details::MultiDevSSAGraphBuilder)
.RequirePassAttr(paddle::framework::details::kLossVarName)
.RequirePassAttr(paddle::framework::details::kPlaces)
......
......@@ -18,7 +18,7 @@
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
......@@ -30,7 +30,7 @@ namespace framework {
class Scope;
namespace details {
class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
class MultiDevSSAGraphBuilder : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_printer.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
......@@ -82,5 +82,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
} // namespace framework
} // namespace paddle
REGISTER_PASS(multi_device_print_pass,
REGISTER_PASS(multi_devices_print_pass,
paddle::framework::details::SSAGraghBuilderWithPrinter);
......@@ -18,7 +18,7 @@
#include <iosfwd>
#include <ostream>
#include <string>
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
namespace paddle {
namespace framework {
......@@ -35,7 +35,7 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
void Print(const ir::Graph& graph, std::ostream& sout) const override;
};
class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
class SSAGraghBuilderWithPrinter : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_helper.h"
// This translation unit currently adds no definitions to
// paddle::framework::details; the namespace below is empty.
namespace paddle {
namespace framework {
namespace details {} // namespace details
} // namespace framework
} // namespace paddle
......@@ -52,33 +52,6 @@ const char kGraphOps[] = "ops";
typedef std::unordered_map<std::string, int> ShardedVarDevice;
const char kShardedVarDevice[] = "sharded_var_device";
// Base pass that bundles static helpers for wiring variable handles and
// control dependencies into an SSA-style ir::Graph.
class SSAGraphBuilder : public ir::Pass {
public:
SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {}
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
protected:
/*
The dependency graph has already been constructed at this point, but data
hazards between successive versions of the same variable still have to be
handled: for every pair of adjacent versions, each op reading the older
version gets a control-dependency edge to the op generating the newer one
(unless they are the same op or are already connected).
*/
static void PolishGraphToSupportDataHazards(ir::Graph *graph);
// Returns the newest VarHandle recorded for node->Name() at place_offset,
// creating version 0 of it first when none has been recorded yet.
static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
const platform::Place &place,
size_t place_offset);
// Add an output variable (each_var_name, place, place_offset) to op_handle,
// which belongs to graph. The new handle's version is the number of versions
// already recorded under that name.
static void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
ir::Node *new_node, const platform::Place &place,
size_t place_offset);
// Gives every op that has no outputs a dummy control-dependency output.
static void AddOutputToLeafOps(ir::Graph *graph);
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include <utility>
namespace paddle {
namespace framework {
namespace details {
// Inserts control-dependency variables so that an op writing a newer version
// of a variable is ordered after every op that reads the previous version.
// Pairs that are the same op, or that are already linked through some
// variable node, are left untouched.
void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
  // True when some output of `reader` shares its ir::Node with an input of
  // `writer`, i.e. the two ops are already ordered.
  auto already_linked = [](OpHandleBase *reader, OpHandleBase *writer) {
    for (auto *produced : reader->Outputs()) {
      for (auto *consumed : writer->Inputs()) {
        if (produced->Node() == consumed->Node()) {
          return true;
        }
      }
    }
    return false;
  };

  for (auto &vars_on_place : graph->Get<GraphVars>(kGraphVars)) {
    for (auto &entry : vars_on_place) {
      auto &versions = entry.second;
      if (versions.size() < 2) {
        // A single version cannot form a hazard.
        continue;
      }

      // Walk adjacent (newer, older) version pairs from newest to oldest.
      auto newer = versions.rbegin();
      auto older = newer;
      ++older;
      while (older != versions.rend()) {
        OpHandleBase *writer = (*newer)->GeneratedOp();
        const auto &readers = (*older)->PendingOps();

        for (auto *reader : readers) {
          // An op that both reads and writes needs no extra edge.
          if (reader == writer) {
            continue;
          }
          if (already_linked(reader, writer)) {
            continue;
          }
          // Manually add a dependency var from reader to writer.
          auto *ctrl_dep = new DummyVarHandle(graph->CreateControlDepVar());
          reader->AddOutput(ctrl_dep);
          writer->AddInput(ctrl_dep);
          graph->Get<GraphDepVars>(kGraphDepVars).emplace(ctrl_dep);
        }

        newer = older;
        ++older;
      }
    }
  }
}
// Returns the most recent VarHandle stored for `node->Name()` at
// `place_offset`. When no version exists yet, version 0 is created (backed by
// a var node if the ir::Node carries a VarDesc, otherwise by an empty
// variable node), recorded, and returned.
VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
    ir::Graph *graph, ir::Node *node, const platform::Place &place,
    size_t place_offset) {
  auto &per_place_vars = graph->Get<GraphVars>(kGraphVars)[place_offset];
  auto &versions = per_place_vars[node->Name()];

  if (!versions.empty()) {
    return versions.rbegin()->get();
  }

  ir::Node *backing_node =
      node->Var()
          ? graph->CreateVarNode(node->Var())
          : graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable);
  VarHandle *first_version =
      new VarHandle(backing_node, 0, place_offset, node->Name(), place);
  versions.emplace_back(first_version);
  return first_version;
}
// Appends a new version of the variable named `new_node->Name()` (at
// `place_offset`) to the graph's variable table and registers it as an
// output of `op_handle`.
void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
                                     ir::Node *new_node,
                                     const platform::Place &place,
                                     size_t place_offset) {
  auto &versions =
      graph->Get<GraphVars>(kGraphVars)[place_offset][new_node->Name()];
  // The next version index is the number of versions recorded so far.
  const size_t next_version = versions.size();
  auto *handle = new VarHandle(new_node, next_version, place_offset,
                               new_node->Name(), place);
  versions.emplace_back(handle);
  op_handle->AddOutput(handle);
}
// Attaches a dummy control-dependency variable to every op that currently
// has no outputs, and records that variable in the graph's dep-var set.
void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) {
  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
    if (op->Outputs().empty()) {
      auto *leaf_marker = new DummyVarHandle(graph->CreateControlDepVar());
      graph->Get<GraphDepVars>(kGraphDepVars).emplace(leaf_marker);
      op->AddOutput(leaf_marker);
    }
  }
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -14,7 +14,7 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......
......@@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
}
for (auto& op : ctx->ops_) {
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
// NOTE! Please do not delete this line, it's usefull because the debug
// string before and after op.run are different, after run the output
// will have right shape which is usefull for debug.
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
......
......@@ -182,9 +182,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
}
/**
* We only handle write after read(WAR), since it should not have a write
* after write in program. If there are write after write operators, we need
* prune them.
* We should handle write after read (WAR) and write after write (WAW) here,
* because some of the operators of the program can be executed in parallel.
* To make the program run in the right order, we add dependencies for both
* WAR and WAW.
*
*
* https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
*/
......@@ -201,6 +203,19 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
(*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
const auto &read_ops = (*it_old)->outputs;
PADDLE_ENFORCE(write_op, "The write_op should not be empty.");
// Add write after write dependence
ir::Node *upstream_op =
(*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
if (upstream_op) {
ir::Node *dep_var = CreateControlDepVar();
write_op->inputs.push_back(dep_var);
upstream_op->outputs.push_back(dep_var);
dep_var->outputs.push_back(write_op);
dep_var->inputs.push_back(upstream_op);
}
for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op;
if (read_op == write_op) {
......
......@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
}
}
int OpDesc::GetBlockAttr(const std::string &name) const {
// Returns the IDs of all blocks stored in the BLOCKS-typed attribute `name`.
// Fails the enforce check when the attribute is absent; boost::get throws if
// the attribute does not hold a std::vector<BlockDesc *>.
std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
  // Bind by reference: only the IDs are needed, so the pointer vector held
  // by the variant does not have to be copied.
  const auto &blocks = boost::get<std::vector<BlockDesc *>>(it->second);

  std::vector<int> ids;
  ids.reserve(blocks.size());
  for (const auto *block : blocks) {
    ids.push_back(block->ID());
  }
  return ids;
}
int OpDesc::GetBlockAttrId(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
return boost::get<BlockDesc *>(it->second)->ID();
......
......@@ -83,7 +83,9 @@ class OpDesc {
Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttr(const std::string &name) const;
int GetBlockAttrId(const std::string &name) const;
std::vector<int> GetBlocksAttrIds(const std::string &name) const;
void Rename(const std::string &old_name, const std::string &new_name);
......
......@@ -127,7 +127,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(10) << "- " << DebugStringEx(&scope);
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
......@@ -139,7 +139,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(10) << "+ " << DebugStringEx(&scope);
VLOG(3) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
......
......@@ -25,9 +25,9 @@ limitations under the License. */
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
#include "paddle/fluid/framework/details/ssa_graph_printer.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -57,39 +57,39 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
}
// Convert graph to run on multi-devices.
auto multi_device_pass =
ir::PassRegistry::Instance().Get("multi_device_pass");
multi_device_pass->SetNotOwned<const std::vector<platform::Place>>("places",
&places);
multi_device_pass->SetNotOwned<const std::string>("loss_var_name",
&loss_var_name);
multi_device_pass->SetNotOwned<const std::unordered_set<std::string>>(
auto multi_devices_pass =
ir::PassRegistry::Instance().Get("multi_devices_pass");
multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
&places);
multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
&loss_var_name);
multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
"params", &param_names);
multi_device_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
multi_device_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
multi_device_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
graph = multi_device_pass->Apply(std::move(graph));
graph = multi_devices_pass->Apply(std::move(graph));
// Apply a graph print pass to record a graph with device info.
if (!strategy.debug_graphviz_path_.empty()) {
auto multi_device_print_pass =
ir::PassRegistry::Instance().Get("multi_device_print_pass");
multi_device_print_pass->SetNotOwned<const std::string>(
auto multi_devices_print_pass =
ir::PassRegistry::Instance().Get("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy.debug_graphviz_path_);
multi_device_print_pass->Set<details::GraphvizSSAGraphPrinter>(
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
graph = multi_device_print_pass->Apply(std::move(graph));
graph = multi_devices_print_pass->Apply(std::move(graph));
}
// Verify that the graph is correct for multi-device executor.
auto multi_device_check_pass =
ir::PassRegistry::Instance().Get("multi_device_check_pass");
graph = multi_device_check_pass->Apply(std::move(graph));
auto multi_devices_check_pass =
ir::PassRegistry::Instance().Get("multi_devices_check_pass");
graph = multi_devices_check_pass->Apply(std::move(graph));
return graph;
}
......@@ -354,6 +354,6 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace paddle
USE_PASS(graph_viz_pass);
USE_PASS(multi_device_pass);
USE_PASS(multi_device_check_pass);
USE_PASS(multi_device_print_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
......@@ -19,7 +19,7 @@ limitations under the License. */
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
......
......@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
for (const std::string &attr_name : op->AttrNames()) {
if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
int sub_block_id =
o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
}
}
......
......@@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager {
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu"});
{"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node);
if (teller_set.count(func->func_type()))
if (teller_set.count(func->func_type())) {
return true;
else {
} else {
return false;
}
};
......
......@@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::vector<Node *>(outputs.begin(), outputs.end()));
}
// For every non-value, non-deleted node that is not a plain Function (i.e.
// the sub-graph block nodes), drops the outlinks whose names are not consumed
// as an input by any op that comes later in topological order. Each such node
// must keep at least one outlink, otherwise the enforce check fails.
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
  // Collect the op-like nodes in topological order.
  std::vector<Node *> op_nodes;
  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
    if (node.type() == Node::Type::kValue || node.deleted()) {
      continue;
    }
    op_nodes.push_back(&node);
  }

  // Sweep from the last op to the first while accumulating the input names of
  // everything already visited. When node i is processed, the set holds
  // exactly the input names of nodes i+1..end, so the per-node rescan of all
  // followers (quadratic in the number of ops) is not needed.
  std::unordered_set<std::string> follow_up_input_names;
  for (size_t i = op_nodes.size(); i-- > 0;) {
    Node *op = op_nodes[i];

    if (op->type() != Node::Type::kFunction) {
      std::vector<Node *> filtered_subgraph_outlinks;
      for (auto *out : op->outlinks) {
        if (follow_up_input_names.count(out->name())) {
          filtered_subgraph_outlinks.push_back(out);
        }
      }
      PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
      op->outlinks = filtered_subgraph_outlinks;
    }

    // Only inlinks are read here and only outlinks are rewritten above, so
    // the accumulated set is unaffected by the filtering.
    for (auto *in : op->inlinks) {
      follow_up_input_names.insert(in->name());
    }
  }
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -178,6 +178,7 @@ struct GraphTraits<DataFlowGraph> {
std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph); // NOLINT
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
bool DataFlowGraphToFluidPass::Finalize() { return true; }
void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
FilterRedundantOutputOfSubGraph(graph);
LOG(INFO) << "graph.inputs " << graph->inputs.size();
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.deleted()) continue;
......
......@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) {
if (!config_.display_deleted_node && in->deleted()) continue;
dot.AddEdge(in->repr(), node.repr(), {});
for (auto &out : node.outlinks) {
if (!config_.display_deleted_node && out->deleted()) continue;
dot.AddEdge(node.repr(), out->repr(), {});
}
}
return dot.Build();
......
......@@ -45,7 +45,6 @@ endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api)
......@@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e
endif()
if (WITH_ANAKIN) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
fetch_include_recursively(${ANAKIN_INCLUDE})
# compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
# anakin_target(<target_name>)
# Adds ${ANAKIN_COMPILE_EXTRA_FLAGS} to <target_name> as PUBLIC compile
# options, inserted BEFORE the options the target already has.
function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction()
anakin_target(inference_anakin_api)
#anakin_target(inference_anakin_api_shared)
if (WITH_TESTING)
# this test is unstable, disable it first.
#cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
#ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
#DEPS inference_anakin_api_shared)
#target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api dynload_cuda SERIAL)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif()
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
......@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
// Copy assignment.
//  - If `other` only references external memory, this buffer ends up
//    referencing the same memory (shallow copy, not owned).
//  - If `other` owns its memory, this buffer gets its own deep copy.
// Any storage previously owned by this buffer is released; external memory
// previously referenced by this buffer is never written to or freed.
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
  if (this == &other) {
    // Nothing to do; also avoids memcpy on identical ranges.
    return *this;
  }

  if (!other.memory_owned_) {
    // Release our own allocation (no-op when we do not own one) before
    // aliasing the external memory, otherwise it would leak.
    Free();
    data_ = other.data_;
    length_ = other.length_;
    memory_owned_ = false;
  } else {
    if (!memory_owned_) {
      // We currently reference external memory: detach from it so that
      // Resize() always allocates fresh storage instead of returning early
      // on equal lengths and letting us copy into (and later delete[]) a
      // buffer we do not own.
      data_ = nullptr;
      length_ = 0;
      memory_owned_ = true;
    }
    Resize(other.length());
    if (other.length() > 0) {
      memcpy(data_, other.data(), other.length());
    }
    length_ = other.length();
    memory_owned_ = true;
  }
  return *this;
}
// Move assignment: takes over `other`'s pointer, length and ownership flag
// and leaves `other` empty. Storage previously owned by this buffer is
// released first so it does not leak.
PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
  // Only a buffer that references external memory may be moved from
  // (checked in debug builds only).
  assert(!other.memory_owned_);
  if (this == &other) {
    // Self-move would otherwise null out our own data below.
    return *this;
  }
  // No-op unless we own an allocation.
  Free();
  data_ = other.data_;
  length_ = other.length_;
  memory_owned_ = other.memory_owned_;
  other.data_ = nullptr;
  other.length_ = 0;
  other.memory_owned_ = false;
  return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
assert(memory_owned_);
Free();
if (memory_owned_) {
Free();
}
data_ = new char[length];
length_ = length;
memory_owned_ = true;
......@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
delete static_cast<char*>(data_);
delete[] static_cast<char*>(data_);
data_ = nullptr;
length_ = 0;
}
......
......@@ -40,6 +40,7 @@ class PaddleBuf {
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
......
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
activation_op.cc
activation_op.cc softmax_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......@@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* SoftMaxOp, ISoftMaxLayer in TRT. This layer doesn't have weights.
*/
class SoftMaxOpConverter : public OpConverter {
 public:
  // Adds a TensorRT SoftMax layer for the given fluid softmax op: the layer
  // reads the ITensor registered under the op's first "X" input and its
  // first output is registered under the op's first "Out" name. In test mode
  // that output is additionally declared as an engine output.
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4)
        << "convert a fluid softmax op to tensorrt softmax layer without bias";

    framework::OpDesc op_desc(op, nullptr);

    // Look up the already-registered input tensor.
    auto* x_tensor = engine_->GetITensor(op_desc.Input("X")[0]);
    auto* softmax_layer = TRT_ENGINE_ADD_LAYER(
        engine_, SoftMax, *const_cast<nvinfer1::ITensor*>(x_tensor));

    // Publish the layer output under the fluid output name.
    auto out_name = op_desc.Output("Out")[0];
    engine_->SetITensor(out_name, softmax_layer->getOutput(0));

    if (!test_mode) {
      return;
    }
    engine_->DeclareOutput(out_name);
  }
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Builds a single softmax op, hands it to TRTConvertValidation and executes
// it.
TEST(SoftMaxOpConverter, main) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
// NOTE(review): the first argument is presumably the max batch size and the
// last one a workspace size - confirm against TRTConvertValidation's ctor.
TRTConvertValidation validator(8, parameters, scope, 1000);
// The fluid-side tensor shape is given explicitly; the TensorRT side is
// declared separately as DimsCHW(10, 1, 1).
std::vector<int> tensor_shape{8, 10};
validator.DeclInputVar("softmax-X", tensor_shape,
nvinfer1::DimsCHW(10, 1, 1));
validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("softmax");
desc.SetInput("X", {"softmax-X"});
desc.SetOutput("Out", {"softmax-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
// NOTE(review): 3 is presumably the batch size used for this run - confirm.
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
......@@ -79,6 +79,12 @@ class TRTConvertValidation {
}
// Declare a Variable as input with random initialization.
// Overload that takes the fluid tensor shape and the TensorRT input shape
// separately: the fluid variable is declared with `tensor_dims` via DeclVar,
// and the engine input is declared as kFLOAT with `trt_dims`.
void DeclInputVar(const std::string& name, const std::vector<int> tensor_dims,
const nvinfer1::Dims& trt_dims) {
DeclVar(name, tensor_dims);
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims);
}
void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims);
// Declare TRT inputs.
......@@ -94,12 +100,18 @@ class TRTConvertValidation {
DeclVar(name, dims);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
// Init Fluid tensor.
std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
// There is no batchsize in ITensor's shape, but We should add it to
......@@ -107,10 +119,8 @@ class TRTConvertValidation {
// if_add_batch_ flag is true, add the max batchsize to dim_vec.
if (is_param != true && if_add_batch_ == true)
dim_vec.insert(dim_vec.begin(), max_batch_size_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
DeclVar(name, dim_vec);
}
void SetOp(const framework::proto::OpDesc& desc) {
......
......@@ -235,7 +235,12 @@ else()
endif()
op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
if(WITH_GPU)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub)
else()
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
endif()
op_library(softmax_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND)
......@@ -273,9 +278,9 @@ op_library(squeeze_op DEPS reshape_op)
op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub)
else()
op_library(conv_op DEPS vol2col im2col)
endif()
......
......@@ -280,12 +280,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
......@@ -423,16 +427,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
* ('any') which lets a primitive (conv backward in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto diff_src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto diff_weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto diff_dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
// Retrieve conv_pd from device context
auto conv_pd =
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -188,6 +188,7 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
REGISTER_OP_CPU_KERNEL(
crop, ops::CropKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/crop_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
REGISTER_OP_CUDA_KERNEL(
crop, ops::CropKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -58,32 +58,74 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
return res;
}
template <typename T>
// Crops input "X" into output "Out" for tensors of rank D, using an Eigen
// slice that starts at the offsets returned by GetOffsets(context) and has
// the extent of Out's dims. A leading Out dimension of -1 is replaced by X's
// leading dimension before the output buffer is allocated.
template <typename DeviceContext, typename T, size_t D>
void CropFunction(const framework::ExecutionContext& context) {
  auto* x = context.Input<Tensor>("X");
  auto* out = context.Output<Tensor>("Out");

  auto out_dims = out->dims();
  if (out_dims[0] == -1) {
    out_dims[0] = x->dims()[0];
  }
  out->mutable_data<T>(out_dims, context.GetPlace());

  // The slice below works directly on offsets and shape, so the stride and
  // flat-offset values the previous version computed were never used and
  // have been removed.
  auto offsets = GetOffsets(context);

  auto x_tensor = EigenTensor<T, D>::From(*x);
  auto out_tensor = EigenTensor<T, D>::From(*out);
  Eigen::array<int, D> e_offsets;
  Eigen::array<int, D> e_shape;
  for (size_t i = 0; i < D; ++i) {
    e_offsets[i] = offsets[i];
    e_shape[i] = out->dims()[i];
  }

  auto& place =
      *context.template device_context<DeviceContext>().eigen_device();
  out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape);
}
template <typename DeviceContext, typename T>
class CropKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
const T* x_data = x->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
auto x_stride = framework::stride(x->dims());
auto out_stride = framework::stride(out->dims());
auto offsets = GetOffsets(context);
int64_t offset = 0;
for (size_t i = 0; i < offsets.size(); ++i) {
offset += (x_stride[i] * offsets[i]);
int rank = context.Input<Tensor>("X")->dims().size();
switch (rank) {
case 1:
CropFunction<DeviceContext, T, 1>(context);
break;
case 2:
CropFunction<DeviceContext, T, 2>(context);
break;
case 3:
CropFunction<DeviceContext, T, 3>(context);
break;
case 4:
CropFunction<DeviceContext, T, 4>(context);
break;
case 5:
CropFunction<DeviceContext, T, 5>(context);
break;
case 6:
CropFunction<DeviceContext, T, 6>(context);
break;
default:
PADDLE_THROW(
"CropOp only support tensors with no more than 6 dimensions.");
}
StridedMemcpy<T>(context.device_context(), x_data + offset, x_stride,
out->dims(), out_stride, out_data);
}
};
template <typename DeviceContext, typename T, size_t D>
void CropGradFunction(const framework::ExecutionContext& context) {
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
auto* x = context.Input<Tensor>("X");
if (d_x != nullptr) {
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
d_x->mutable_data<T>(context.GetPlace());
d_x->mutable_data<T>(x->dims(), context.GetPlace());
auto offsets = GetOffsets(context);
Eigen::array<std::pair<int, int>, D> paddings;
for (size_t i = 0; i < D; ++i) {
......
......@@ -227,6 +227,9 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_GT(
neg_pos_ratio, 0.0f,
"neg_pos_ratio must greater than zero in max_negative mode");
PADDLE_ENFORCE_LT(
neg_dist_threshold, 1.0f,
"neg_dist_threshold must less than one in max_negative mode");
PADDLE_ENFORCE_GT(
neg_dist_threshold, 0.0f,
"neg_dist_threshold must greater than zero in max_negative mode");
......
......@@ -41,6 +41,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
// Async
if (!sync_mode_) {
rpc_server_->Profiler().OneStep();
try {
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
scope);
......
......@@ -18,11 +18,44 @@
#include <string>
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_server_profile_period, 0,
"the period of listen_and_serv to do profile");
DEFINE_string(rpc_server_profile_path, "/dev/null",
"the profile log file path");
namespace paddle {
namespace operators {
namespace distributed {
// Stores the profiling period and the log path, and starts counting at step 0.
RPCServerProfiler::RPCServerProfiler(int profile_period,
                                     const std::string& profile_log_path)
    : profile_period_(profile_period),
      profile_log_path_(profile_log_path),
      step_(0) {}
// Advances the profiler by one server step.
//
// The CPU profiler is enabled when the step counter is 0 and disabled (with
// its report written to profile_log_path_) once the counter reaches
// profile_period_, after which the counter restarts from 0.
void RPCServerProfiler::OneStep() {
  // A non-positive period means profiling is switched off. This must be
  // checked before the sanity check below: step_ starts at 0, so a negative
  // period would otherwise make the enforce fail instead of disabling
  // profiling.
  if (profile_period_ <= 0) {
    return;
  }
  PADDLE_ENFORCE_LE(step_, profile_period_,
                    "step_ should not be larger then "
                    "profile_period_");
  if (step_ == 0) {
    auto pf_state = paddle::platform::ProfilerState::kCPU;
    paddle::platform::EnableProfiler(pf_state);
  }
  if (step_ == profile_period_) {
    paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
                                      profile_log_path_);
    step_ = 0;
  } else {
    step_++;
  }
}
void RPCServer::ShutDown() {
LOG(INFO) << "RPCServer ShutDown ";
ShutDownImpl();
......
......@@ -19,16 +19,33 @@
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32(rpc_server_profile_period);
DECLARE_string(rpc_server_profile_path);
namespace paddle {
namespace operators {
namespace distributed {
// Step-driven wrapper around the platform profiler for the RPC server.
// Callers invoke OneStep() once per server step; the profiler is enabled at
// step 0 and disabled (dumping its report) when the step counter reaches the
// configured period.
class RPCServerProfiler {
public:
// profile_period: number of steps in one profiling window; a period of 0
// makes OneStep() a no-op.
// profile_log_path: path handed to DisableProfiler when a window ends.
RPCServerProfiler(int profile_period, const std::string& profile_log_path);
// Advances the step counter and enables/disables the profiler as needed.
void OneStep();
private:
// Length of one profiling window, in steps.
const int profile_period_;
// Destination of the profiler report.
std::string profile_log_path_;
// Steps taken in the current window; reset to 0 when the window ends.
int step_;
};
class RPCServer {
public:
explicit RPCServer(const std::string& address, int client_num)
: cur_cond_(0),
profiler_(FLAGS_rpc_server_profile_period,
FLAGS_rpc_server_profile_path),
bind_address_(address),
exit_flag_(false),
selected_port_(0),
......@@ -67,6 +84,7 @@ class RPCServer {
void Complete();
void ResetBarrierCounter();
RPCServerProfiler& Profiler() { return profiler_; }
protected:
virtual void ShutDownImpl() = 0;
......@@ -79,6 +97,7 @@ class RPCServer {
std::unordered_map<std::string, int> rpc_cond_map_;
std::atomic<int> cur_cond_;
std::condition_variable rpc_cond_;
RPCServerProfiler profiler_;
protected:
std::string bind_address_;
......
......@@ -534,8 +534,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
const framework::Tensor& dout, int axis,
framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) {
const framework::DDim x_dim = x.dims();
const framework::DDim y_dim = y.dims();
const framework::DDim& x_dim = x.dims();
const framework::DDim& y_dim = y.dims();
if (x.dims() == y.dims()) {
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
......@@ -558,19 +558,19 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) {
if (dy == nullptr) {
const framework::DDim dx_dims = dout.dims();
const framework::DDim& dx_dims = dout.dims();
auto dy_dims = dx_dims;
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else {
if (dout.dims() == dy->dims()) {
const framework::DDim dx_dims = dout.dims();
const framework::DDim dy_dims = dy->dims();
const framework::DDim& dx_dims = dout.dims();
const framework::DDim& dy_dims = dy->dims();
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else { // Y is a scalar
auto dx_dims = dout.dims();
const framework::DDim dy_dims = dy->dims();
const framework::DDim& dy_dims = dy->dims();
ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,8 +12,512 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cub/cub.cuh>
#include "paddle/fluid/operators/layer_norm_op.h"
namespace paddle {
namespace operators {
// Returns the CUDA block size to use for a reduction over `block_dim`
// elements: the largest power of two that is <= block_dim, capped at 512.
//
// The result is computed with integer arithmetic only, so it does not depend
// on floating-point log2 rounding. Inputs <= 1 (including 0 and negative
// values, for which a float log2 followed by a shift would be undefined)
// yield 1.
inline static int GetDesiredBlockDim(int block_dim) {
  const int kMaxBlockDim = 512;
  if (block_dim >= kMaxBlockDim) {
    return kMaxBlockDim;
  }
  int desired = 1;
  // Double `desired` while the next power of two still fits in block_dim.
  while ((desired << 1) <= block_dim) {
    desired <<= 1;
  }
  return desired;
}
// Emits one `case` label for a block size of 2^log2_block_dim. Inside the
// case, `kBlockDim` is a compile-time constant equal to that block size, so
// the statement passed through __VA_ARGS__ can use it as a template argument.
#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \
case (1 << (log2_block_dim)): { \
constexpr auto kBlockDim = (1 << (log2_block_dim)); \
__VA_ARGS__; \
} break
// Expands to the cases for block sizes 512, 256, ..., 2 (log2 from 9 down to
// 1). There is no case for a block size of 1; the enclosing switch has to
// handle that value itself (e.g. with a `default` label) if it can occur.
// NOTE: `##__VA_ARGS__` is a GNU preprocessor extension.
#define FIXED_BLOCK_DIM_CASE(...) \
FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__)
// Device-side square root overloaded on precision, so templated kernels can
// call real_sqrt() for both float and double.
static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); }
static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); }
// A pair of values of type T that is reduced as one unit, so two sums can be
// computed with a single block reduction.
template <typename T>
struct PairForLayerNorm {
// Default constructor: leaves both members uninitialized.
__device__ __forceinline__ PairForLayerNorm() {}
__device__ __forceinline__ PairForLayerNorm(const T &first, const T &second)
: first_(first), second_(second) {}
T first_;
T second_;
};
// Reduction operator for PairForLayerNorm: adds the two pairs member-wise.
template <typename T>
struct PairForLayerNormAddFunctor {
__device__ __forceinline__ PairForLayerNorm<T> operator()(
const PairForLayerNorm<T> &p1, const PairForLayerNorm<T> &p2) {
return PairForLayerNorm<T>(p1.first_ + p2.first_, p1.second_ + p2.second_);
}
};
// Forward layer normalization. Each CUDA block handles the row selected by
// blockIdx.x (feature_size consecutive elements of x); the threads of the
// block walk that row with a stride of BlockDim.
//
// Outputs: mean[row], var[row] and the normalized row in y. `scale` and
// `bias` are optional (may be nullptr) and are indexed by column.
template <typename T, int BlockDim>
__global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
T *y, T *mean, T *var, float epsilon,
int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size;
// Step 1: Reduce to calculate mean and var
// Each thread accumulates a partial sum of x and of x^2.
T mean_val = static_cast<T>(0);
T var_val = static_cast<T>(0);
for (int i = beg_idx; i < end_idx; i += BlockDim) {
T tmp = x[i];
mean_val += tmp;
var_val += (tmp * tmp);
}
auto pair = BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(mean_val, var_val),
PairForLayerNormAddFunctor<T>());
// Thread 0 turns the reduced sums into the row statistics:
// mean = sum(x) / n, var = sum(x^2) / n - mean^2.
if (threadIdx.x == 0) {
auto tmp = pair.first_ / feature_size;
mean[blockIdx.x] = tmp;
var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
}
__syncthreads();
// Every thread re-reads the statistics written by thread 0. From here on
// var_val holds sqrt(var + epsilon), not the variance.
mean_val = mean[blockIdx.x];
var_val = static_cast<T>(real_sqrt(var[blockIdx.x] + epsilon));
// Step 2: Calculate y
// i indexes the flattened tensor, j the column within the row.
if (scale != nullptr) {
if (bias != nullptr) {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = scale[j] * (x[i] - mean_val) / var_val + bias[j];
}
} else {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = scale[j] * (x[i] - mean_val) / var_val;
}
}
} else { // scale == nullptr
if (bias != nullptr) {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = (x[i] - mean_val) / var_val + bias[j];
}
} else {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = (x[i] - mean_val) / var_val;
}
}
}
}
// Computes d_scale and d_bias together, and optionally a first-pass d_x.
// Each CUDA block handles the feature column selected by blockIdx.x; its
// threads walk the rows of that column with a stride of BlockDim rows.
//
// Make sure that d_scale != nullptr && d_bias != nullptr
// Since d_scale != nullptr, scale would not be nullptr
//
// When HasDx is true, d_x receives d_y * scale / sqrt(var + epsilon); the
// callers in this file follow up with
// LayerNormBackwardPostProcessToCalculateDX to finish d_x.
template <typename T, int BlockDim, bool HasDx>
__global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
T *d_scale, T *d_bias, T *d_x,
const T *mean, const T *var,
const T *scale, float epsilon,
int batch_size, int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
// Thread t starts at row t of this column; `stride` jumps BlockDim rows.
int beg_idx = threadIdx.x * feature_size + blockIdx.x;
int end_idx = batch_size * feature_size + blockIdx.x;
int stride = BlockDim * feature_size;
T d_scale_partial = 0, d_bias_partial = 0;
for (int i = beg_idx; i < end_idx; i += stride) {
int row_idx = i / feature_size;
// var_val is sqrt(var + epsilon) of the current row.
auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
d_bias_partial += d_y[i];
if (HasDx) {
d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
}
}
// Sum the per-thread partials over the block; thread 0 writes the result.
auto pair = BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(d_scale_partial, d_bias_partial),
PairForLayerNormAddFunctor<T>());
if (threadIdx.x == 0) {
d_scale[blockIdx.x] = pair.first_;
d_bias[blockIdx.x] = pair.second_;
}
}
// Computes exactly one of d_scale (HasDScale == true) or d_bias
// (HasDScale == false), and optionally a first-pass d_x (HasDx == true).
// Each CUDA block handles the feature column selected by blockIdx.x; its
// threads walk the rows of that column with a stride of BlockDim rows.
//
// Make sure that there is only one true expression: d_scale != nullptr
// or d_bias != nullptr
// Notice: scale may be nullptr
template <typename T, int BlockDim, bool HasDx, bool HasDScale>
__global__ void LayerNormBackwardGradientScaleOrBias(
const T *x, const T *d_y, T *d_scale, T *d_bias, T *d_x, const T *mean,
const T *var, const T *scale, float epsilon, int batch_size,
int feature_size) {
using BlockReduce = cub::BlockReduce<T, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
// Thread t starts at row t of this column; `stride` jumps BlockDim rows.
int beg_idx = threadIdx.x * feature_size + blockIdx.x;
int end_idx = batch_size * feature_size + blockIdx.x;
int stride = BlockDim * feature_size;
T d_scale_or_d_bias_partial = 0;
for (int i = beg_idx; i < end_idx; i += stride) {
int row_idx = i / feature_size;
// var_val is sqrt(var + epsilon) of the current row.
auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
if (HasDScale) {
d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
} else { // d_bias != nullptr
d_scale_or_d_bias_partial += d_y[i];
}
if (HasDx) {
// A missing scale is treated as a scale of 1.
if (scale != nullptr) {
d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
} else {
d_x[i] = d_y[i] / var_val;
}
}
}
// Sum the per-thread partials over the block; thread 0 writes the result.
d_scale_or_d_bias_partial =
BlockReduce(temp_storage).Reduce(d_scale_or_d_bias_partial, cub::Sum());
if (threadIdx.x == 0) {
if (HasDScale) {
d_scale[blockIdx.x] = d_scale_or_d_bias_partial;
} else {
d_bias[blockIdx.x] = d_scale_or_d_bias_partial;
}
}
}
// Second pass over d_x. On entry d_x holds the first-pass value written by
// one of the other backward kernels; this kernel subtracts two row-wise
// correction terms from it in place.
// Each CUDA block handles the row selected by blockIdx.x; its threads walk
// the row with a stride of BlockDim.
template <typename T, int BlockDim>
__global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x,
const T *mean,
const T *var,
float epsilon,
int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
// Shared slots used to broadcast the two reduced values to all threads.
__shared__ T d_x_reduce_tmp[2];
int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size;
T block_mean = mean[blockIdx.x];
T block_var = var[blockIdx.x];
// Per-thread partial sums of d_x and of d_x * (x - mean) over the row.
T d_x_mean_partial = 0, d_x_var_partial = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
d_x_mean_partial += d_x[i];
d_x_var_partial += d_x[i] * (x[i] - block_mean);
}
auto pair =
BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(d_x_mean_partial, d_x_var_partial),
PairForLayerNormAddFunctor<T>());
// Thread 0 normalizes the sums: the first by feature_size, the second by
// feature_size * (var + epsilon).
if (threadIdx.x == 0) {
d_x_reduce_tmp[0] = pair.first_ / feature_size;
d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon));
}
__syncthreads();
// The partial-sum variables are reused to hold the broadcast results.
d_x_mean_partial = d_x_reduce_tmp[0];
d_x_var_partial = d_x_reduce_tmp[1];
for (int i = beg_idx; i < end_idx; i += BlockDim) {
d_x[i] -= d_x_mean_partial;
d_x[i] -= (x[i] - block_mean) * d_x_var_partial;
}
}
// Here, we only calculate d_x (neither d_scale nor d_bias is requested).
// Each CUDA block handles the row selected by blockIdx.x; its threads walk
// the row with a stride of BlockDim. `scale` may be nullptr, in which case
// it is treated as 1.
//
// The kernel first writes d_x = d_y * scale / sqrt(var + epsilon) while
// accumulating the row sums of d_x and d_x * (x - mean), then subtracts the
// two normalized sums from d_x in place.
template <typename T, int BlockDim>
__global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y,
                                                T *d_x, const T *mean,
                                                const T *var, const T *scale,
                                                float epsilon,
                                                int feature_size) {
  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  // Shared slots used to broadcast the two reduced values to all threads.
  __shared__ T d_x_reduce_tmp[2];

  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
  int end_idx = (blockIdx.x + 1) * feature_size;

  T block_mean = mean[blockIdx.x], block_var = var[blockIdx.x];
  // sqrt(var + epsilon) is the same for every element of the row, so it is
  // computed once instead of once per loop iteration.
  const auto var_val = static_cast<T>(real_sqrt(block_var + epsilon));

  T d_x_mean_partial = 0, d_x_var_partial = 0;
  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    if (scale != nullptr) {
      int col_idx = i % feature_size;
      d_x[i] = d_y[i] * scale[col_idx] / var_val;
    } else {
      d_x[i] = d_y[i] / var_val;
    }
    d_x_mean_partial += d_x[i];
    d_x_var_partial += d_x[i] * (x[i] - block_mean);
  }

  auto pair =
      BlockReduce(temp_storage)
          .Reduce(PairForLayerNorm<T>(d_x_mean_partial, d_x_var_partial),
                  PairForLayerNormAddFunctor<T>());

  if (threadIdx.x == 0) {
    d_x_reduce_tmp[0] = pair.first_ / feature_size;
    d_x_reduce_tmp[1] = pair.second_ / (feature_size * (block_var + epsilon));
  }
  __syncthreads();

  // The partial-sum variables are reused to hold the broadcast results.
  d_x_mean_partial = d_x_reduce_tmp[0];
  d_x_var_partial = d_x_reduce_tmp[1];
  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    d_x[i] -= d_x_mean_partial;
    d_x[i] -= (x[i] - block_mean) * d_x_var_partial;
  }
}
// Backward pass for the special case of a single row (batch_size == 1).
// One thread handles one feature column `idx`; no cross-row reduction is
// needed because every column has exactly one element.
//
// d_x, d_scale and d_bias may each be nullptr (gradient not requested);
// `scale` may be nullptr and is then treated as 1. d_x written here is the
// first-pass value; the caller finishes it with
// LayerNormBackwardPostProcessToCalculateDX.
template <typename T>
__global__ void LayerNormBackwardWhenBatchSizeIsOne(
    const T *x, const T *d_y, T *d_x, T *d_scale, T *d_bias, const T *mean,
    const T *var, const T *scale, float epsilon, int feature_size) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < feature_size) {
    // mean and var hold one entry per row and there is only one row, so the
    // statistics live at index 0. Indexing them by the column `idx` would
    // read past the end of both arrays.
    const T row_mean = mean[0];
    auto var_val = static_cast<T>(real_sqrt(var[0] + epsilon));
    if (d_x != nullptr) {
      // Whether to multiply by scale depends on scale itself being present,
      // not on whether its gradient was requested.
      if (scale == nullptr) {
        d_x[idx] = d_y[idx] / var_val;
      } else {
        d_x[idx] = d_y[idx] * scale[idx] / var_val;
      }
    }

    if (d_scale != nullptr) {
      d_scale[idx] = d_y[idx] * (x[idx] - row_mean) / var_val;
    }

    if (d_bias != nullptr) d_bias[idx] = d_y[idx];
  }
}
// Host-side dispatcher for the layer-norm backward pass. Picks the kernels
// to launch on `stream` according to which of d_x / d_scale / d_bias are
// requested (non-null), encoded as a 3-bit flag:
//   bit 2 = d_x, bit 1 = d_scale, bit 0 = d_bias.
// Kernels that reduce over rows are launched with one block per feature
// column and a block size derived from batch_size; kernels that reduce over
// a row are launched with one block per row and a block size derived from
// feature_size.
// NOTE(review): the switches over GetDesiredBlockDim(feature_size) have no
// `default` label, so nothing is launched there when the desired block size
// is 1 (feature_size == 1) — confirm callers never reach that case.
template <typename T>
static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
const T *mean, const T *var, T *d_x, T *d_scale,
T *d_bias, float epsilon, int batch_size,
int feature_size, cudaStream_t stream) {
const int kMaxBlockDim = 512;
int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
((d_scale != nullptr ? 1 : 0) << 1) |
((d_bias != nullptr ? 1 : 0));
// Nothing requested: no kernel to launch.
if (gradient_flag == 0) return;
// Single-row input: an element-wise kernel replaces the row reductions.
if (batch_size == 1) {
LayerNormBackwardWhenBatchSizeIsOne<
T><<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0,
stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon,
feature_size);
if (d_x != nullptr) {
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX<
T, kBlockDim><<<1, kBlockDim, 0, stream>>>(
x, d_x, mean, var, epsilon, feature_size));
}
}
return;
}
// Block size for the kernels that reduce over the batch dimension.
auto block_dim = GetDesiredBlockDim(batch_size);
switch (gradient_flag) {
case 1: // d_x == nullptr, d_scale == nullptr, d_bias != nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, false,
false><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
feature_size));
}
break;
case 2: // d_x == nullptr, d_scale != nullptr, d_bias == nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, false,
true><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
feature_size));
}
break;
case 3: // d_x == nullptr, d_scale != nullptr, d_bias != nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardGradientAll<
T, kBlockDim, false><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size));
}
break;
case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardGradientOnlyDX<
T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x, d_y, d_x, mean, var, scale, epsilon, feature_size));
}
break;
case 5: // d_x != nullptr, d_scale == nullptr, d_bias != nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, true,
false><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
feature_size));
}
// Second pass: finish d_x row by row.
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardPostProcessToCalculateDX<
T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x, d_x, mean, var, epsilon, feature_size));
}
break;
case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(LayerNormBackwardGradientScaleOrBias<
T, kBlockDim, true,
true><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size,
feature_size));
}
// Second pass: finish d_x row by row.
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardPostProcessToCalculateDX<
T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x, d_x, mean, var, epsilon, feature_size));
}
break;
case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr
switch (block_dim) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardGradientAll<
T, kBlockDim, true><<<feature_size, kBlockDim, 0, stream>>>(
x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
batch_size, feature_size));
}
// Second pass: finish d_x row by row.
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormBackwardPostProcessToCalculateDX<
T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x, d_x, mean, var, epsilon, feature_size));
}
break;
default:
break;
}
}
// CUDA forward kernel of the layer_norm operator.
// Reads inputs "X", "Scale", "Bias" and attributes "epsilon",
// "begin_norm_axis"; writes outputs "Y", "Mean", "Variance".
template <typename T>
class LayerNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias");
auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x->dims();
auto *x_data = x->data<T>();
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
auto *mean_data = mean->mutable_data<T>(ctx.GetPlace());
auto *var_data = var->mutable_data<T>(ctx.GetPlace());
// Scale and Bias are optional; a null tensor becomes a null data pointer.
auto *scale_data = (scale == nullptr ? nullptr : scale->data<T>());
auto *bias_data = (bias == nullptr ? nullptr : bias->data<T>());
// View X as a 2-D matrix split at begin_norm_axis:
// matrix_dim[0] is used as the number of rows (batch_size) and
// matrix_dim[1] as the row length (feature_size).
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int batch_size = static_cast<int>(matrix_dim[0]);
int feature_size = static_cast<int>(matrix_dim[1]);
auto stream = ctx.cuda_device_context().stream();
// One block per row; the block size is chosen from feature_size. A desired
// block size of 1 matches no case and falls through to the error below.
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormForward<T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
x_data, scale_data, bias_data, y_data, mean_data, var_data,
epsilon, feature_size));
default:
PADDLE_THROW(
"Product from begin_norm_axis to end must be larger than 1");
break;
}
}
};
// CUDA backward kernel of the layer_norm operator.
// Reads "X", "Mean", "Variance", "Scale" and the gradient of "Y"; writes the
// gradients of "X", "Scale" and "Bias" for whichever of them are requested.
template <typename T>
class LayerNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
// d_x, d_scale, d_bias may be nullptr
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto *x = ctx.Input<Tensor>("X");
auto *mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale");
auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto *x_data = x->data<T>();
auto *d_y_data = d_y->data<T>();
auto *mean_data = mean->data<T>();
auto *var_data = var->data<T>();
// Optional tensors are passed on as null data pointers when absent; output
// buffers are only allocated for the gradients that were requested.
auto *scale_data = (scale == nullptr ? nullptr : scale->data<T>());
auto *d_scale_data =
(d_scale == nullptr ? nullptr
: d_scale->mutable_data<T>(ctx.GetPlace()));
auto *d_bias_data =
(d_bias == nullptr ? nullptr : d_bias->mutable_data<T>(ctx.GetPlace()));
auto *d_x_data =
(d_x == nullptr ? nullptr : d_x->mutable_data<T>(ctx.GetPlace()));
// View X as a 2-D matrix split at begin_norm_axis:
// matrix_dim[0] is used as batch_size and matrix_dim[1] as feature_size.
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int batch_size = static_cast<int>(matrix_dim[0]);
int feature_size = static_cast<int>(matrix_dim[1]);
auto stream = ctx.cuda_device_context().stream();
LayerNormBackward<T>(x_data, d_y_data, scale_data, mean_data, var_data,
d_x_data, d_scale_data, d_bias_data, epsilon,
batch_size, feature_size, stream);
}
};
#undef FIXED_BLOCK_DIM_CASE_BASE
#undef FIXED_BLOCK_DIM_CASE
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
layer_norm,
......
......@@ -25,10 +25,6 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32(listen_and_serv_profile_period, 0,
"the period of listen_and_serv to do profile");
namespace paddle {
namespace operators {
......@@ -108,6 +104,7 @@ void ListenAndServOp::RunSyncLoop(
framework::Scope *recv_scope,
const std::vector<int> &prefetch_block_id_list,
const int checkpoint_point_block_id) const {
VLOG(2) << "RunSyncLoop";
size_t num_blocks = program->Size();
auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
......@@ -128,17 +125,8 @@ void ListenAndServOp::RunSyncLoop(
rpc_service_->ResetBarrierCounter();
int32_t profile_step = 0;
while (true) {
PADDLE_ENFORCE_LE(profile_step, FLAGS_listen_and_serv_profile_period,
"profile_step should not be larger then "
"FLAGS_listen_and_serv_profile_period");
if (FLAGS_listen_and_serv_profile_period > 0) {
if (profile_step == 0) {
auto pf_state = paddle::platform::ProfilerState::kCPU;
paddle::platform::EnableProfiler(pf_state);
}
}
rpc_service_->Profiler().OneStep();
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(distributed::kRequestSend);
......@@ -180,21 +168,13 @@ void ListenAndServOp::RunSyncLoop(
// reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder();
if (FLAGS_listen_and_serv_profile_period > 0) {
if (profile_step == FLAGS_listen_and_serv_profile_period) {
paddle::platform::DisableProfiler(
paddle::platform::EventSortingKey::kTotal, "/dev/null");
profile_step = 0;
} else {
profile_step++;
}
}
} // while(true)
}
void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
framework::ProgramDesc *program,
framework::Scope *recv_scope) const {
VLOG(2) << "RunAsyncLoop";
// grad name to block id
std::unordered_map<std::string, int32_t> grad_to_block_id;
std::unordered_map<int32_t, std::string> id_to_grad;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -14,6 +14,8 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include <cub/cub.cuh>
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
namespace paddle {
......@@ -53,8 +55,196 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
}
}
} // namespace
// Device-side exp/log overloaded on precision, so templated kernels can call
// real_exp() / real_log() for both float and double.
static __device__ __forceinline__ float real_exp(float x) { return expf(x); }
static __device__ __forceinline__ double real_exp(double x) { return exp(x); }
// The logarithm is passed through math::TolerableValue before being
// returned. NOTE(review): presumably this replaces non-finite results with
// finite ones — confirm against math/cross_entropy.h.
static __device__ __forceinline__ float real_log(float x) {
return math::TolerableValue<float>()(logf(x));
}
static __device__ __forceinline__ double real_log(double x) {
return math::TolerableValue<double>()(log(x));
}
/** In the following codes, 3 CUDA kernels are implemented to calculate softmax
* and loss **/
/*
Supposing that x is `logits` and y is `labels`, the equations are as
follows:
cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
= \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
= \sum_{j}(-y_i_j * tmp_i_j)
softmax_i_j = e^{tmp_i_j}
where:
max_i = \max_{j}{x_i_j}
logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
Therefore, the calculation can be separated into 3 steps:
Step 1: row-wise operation to calculate max_i
Step 2: row-wise operation to calculate logDiffMaxSum_i
Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
To save memory, we can share memory among max_i, logDiffMaxSum_i and
cross\_entropy_i.
In this way, the 3 steps should be changed to:
Step 1 (RowReductionForMax): row-wise operation to calculate max_i
Step 2 (RowReductionForDiffMaxSum): calculate immediate result of softmax'_i_j =
x_i_j - max_i, and row-wise operation to calculate logDiffMaxSum_i
Step 3 (RowReductionForSoftmaxAndCrossEntropy): calculate tmp_i_j = softmax'_i_j
- logDiffMaxSum_i, and finally get softmax_i_j and cross\_entropy_i
*/
// There are 3 kinds of reduce algorithms in cub:
// BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
// BLOCK_REDUCE_RAKING
// BLOCK_REDUCE_WARP_REDUCTIONS (default)
// Block-wide reduction type used by the kernels below. The algorithm template
// argument is left commented out, so cub's default algorithm is used.
template <typename T, int BlockDim>
using BlockReduce =
cub::BlockReduce<T, BlockDim /*, cub::BLOCK_REDUCE_WARP_REDUCTIONS*/>;
// Shared-memory scratch type required by BlockReduce<T, BlockDim>.
template <typename T, int BlockDim>
using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
// Make sure that BlockDim <= feature_size
// (every thread reads its first element unconditionally, so each thread must
// own at least one element of the row).
// This kernel is used to calculate the max element of each row
// Each CUDA block handles the row selected by blockIdx.x and writes the
// result to max_data[blockIdx.x].
template <typename T, int BlockDim>
__global__ void RowReductionForMax(const T* logits_data, T* max_data,
int feature_size) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
// Per-thread running maximum over the elements this thread owns.
T cur_max = logits_data[beg_idx];
beg_idx += BlockDim;
while (beg_idx < end_idx) {
if (cur_max < logits_data[beg_idx]) {
cur_max = logits_data[beg_idx];
}
beg_idx += BlockDim;
}
cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
// The stored maximum is clamped so that it is never smaller than -64.
// NOTE(review): the reason for the -64 floor is not visible here — confirm.
if (threadIdx.x == 0) {
max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max;
}
}
// Make sure that BlockDim <= feature_size
// (every thread reads its first element unconditionally).
// For the row selected by blockIdx.x: stores logits - max into `softmax` as
// an intermediate value, sums exp(logits - max) over the row, and overwrites
// max_data[row] with the log of that sum.
template <typename T, int BlockDim>
__global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
T* softmax, int feature_size) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
auto block_max = max_data[blockIdx.x];
softmax[beg_idx] = logits_data[beg_idx] - block_max;
T diff_max_sum = real_exp(softmax[beg_idx]);
beg_idx += BlockDim;
while (beg_idx < end_idx) {
softmax[beg_idx] = logits_data[beg_idx] - block_max;
diff_max_sum += real_exp(softmax[beg_idx]);
beg_idx += BlockDim;
}
diff_max_sum =
BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
// max_data is reused: from here on it holds log(sum(exp(x - max))).
if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
}
// Make sure that BlockDim <= feature_size
// (every thread reads its first element unconditionally).
// For the row selected by blockIdx.x: turns the intermediate values in
// `softmax` into the final softmax, and reduces -labels * log_softmax over
// the row into loss_data[row].
// On entry loss_data[row] must hold the row's log-sum value and `softmax`
// the row's logits - max values.
// NOTE(review): logits_data is not read in this kernel.
template <typename T, int BlockDim>
__global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
const T* labels_data,
T* loss_data, T* softmax,
int feature_size) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
// log_diff_max_sum shares memory with loss
auto block_log_diff_max_sum = loss_data[blockIdx.x];
// tmp is the log-softmax of the element; exp(tmp) is its softmax.
auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = real_exp(tmp);
auto loss = -labels_data[beg_idx] * tmp;
beg_idx += BlockDim;
while (beg_idx < end_idx) {
tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = real_exp(tmp);
loss -= (labels_data[beg_idx] * tmp);
beg_idx += BlockDim;
}
loss = BlockReduce<T, BlockDim>(temp_storage).Reduce(loss, cub::Sum());
if (threadIdx.x == 0) loss_data[blockIdx.x] = loss;
}
// Fills out[0 .. batch_size) with 1, one element per thread. Threads whose
// global index is >= batch_size do nothing.
template <typename T>
__global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int batch_size) {
auto idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < batch_size) out[idx] = static_cast<T>(1);
}
// Host-side launcher of the fused softmax + soft-label cross-entropy
// computation. For every row of `logits_data` (batch_size rows of
// feature_size elements) it fills `softmax_data` with the softmax and
// `loss_data` with the cross entropy against `labels_data`.
//
// Three kernels are launched back to back on `stream`, one block per row;
// `loss_data` doubles as scratch space for the row maximum and the log-sum
// between them. The block size is the largest power of two that is
// <= feature_size, capped at 512. A feature_size of 1 is special-cased:
// softmax is 1 and the loss is 0 for every row.
template <typename T>
static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
                                               const T* labels_data,
                                               T* softmax_data, T* loss_data,
                                               int batch_size, int feature_size,
                                               cudaStream_t stream) {
  constexpr int kMaxBlockDim = 512;
  int block_dim = feature_size >= kMaxBlockDim
                      ? kMaxBlockDim
                      : (1 << static_cast<int>(std::log2(feature_size)));

// Emits the `case` for one compile-time block size and launches the three
// row-reduction kernels with it.
#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
  case BlockDim: \
    RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
        logits_data, loss_data, feature_size); \
    RowReductionForDiffMaxSum<T, \
                              BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
        logits_data, loss_data, softmax_data, feature_size); \
    RowReductionForSoftmaxAndCrossEntropy< \
        T, BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
        logits_data, labels_data, loss_data, softmax_data, feature_size); \
    break

  switch (block_dim) {
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256);
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128);
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64);
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32);
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16);
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8);
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
    case 1:
      SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) /
                                                kMaxBlockDim,
                                            kMaxBlockDim, 0, stream>>>(
          softmax_data, batch_size);
      // cudaMemsetAsync takes a size in bytes, not in elements; without the
      // sizeof(T) factor only part of the loss buffer would be zeroed.
      cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
      break;
    default:
      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
      break;
  }
#undef CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
}
template <typename T>
class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
public:
......@@ -66,14 +256,24 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
Tensor* softmax = context.Output<Tensor>("Softmax");
Tensor* loss = context.Output<Tensor>("Loss");
softmax->mutable_data<T>(context.GetPlace());
loss->mutable_data<T>(context.GetPlace());
math::SoftmaxFunctor<platform::CUDADeviceContext, T>()(
context.cuda_device_context(), logits, softmax);
math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
context.cuda_device_context(), loss, softmax, labels,
context.Attr<bool>("soft_label"));
auto* softmax_data = softmax->mutable_data<T>(context.GetPlace());
auto* loss_data = loss->mutable_data<T>(context.GetPlace());
auto soft_label = context.Attr<bool>("soft_label");
if (soft_label) {
int batch_size = logits->dims()[0];
int feature_size = logits->dims()[1];
auto* logits_data = logits->data<T>();
auto* labels_data = labels->data<T>();
SoftmaxWithCrossEntropyFusedKernel(
logits_data, labels_data, softmax_data, loss_data, batch_size,
feature_size, context.cuda_device_context().stream());
} else {
math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
softmax);
math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
context.cuda_device_context(), loss, softmax, labels, false);
}
}
};
......
......@@ -18,7 +18,11 @@ else()
endif()
cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce)
# Libraries cpu_info links against; xbyak is added only when it is enabled.
set(CPU_INFO_DEPS gflags glog enforce)
if(WITH_XBYAK)
  list(APPEND CPU_INFO_DEPS xbyak)
endif()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
......
......@@ -14,6 +14,11 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
#endif
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/types.h>
......@@ -98,5 +103,39 @@ size_t CUDAPinnedMaxChunkSize() {
return CUDAPinnedMaxAllocSize() / 256;
}
#ifdef PADDLE_WITH_XBYAK
namespace jit {
static Xbyak::util::Cpu cpu;
// Reports whether the host CPU, as probed by the file-level Xbyak `cpu`
// object, has every feature bit required by `cpu_isa`.
// `isa_any` is always usable; an unrecognized value yields false.
bool MayIUse(const cpu_isa_t cpu_isa) {
  using Xbyak::util::Cpu;

  if (cpu_isa == isa_any) {
    return true;
  }
  if (cpu_isa == sse42) {
    return cpu.has(Cpu::tSSE42);
  }
  if (cpu_isa == avx2) {
    return cpu.has(Cpu::tAVX2);
  }
  if (cpu_isa == avx512_common) {
    return cpu.has(Cpu::tAVX512F);
  }
  if (cpu_isa == avx512_core) {
    return cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) &&
           cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ);
  }
  if (cpu_isa == avx512_core_vnni) {
    return cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) &&
           cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) &&
           cpu.has(Cpu::tAVX512_VNNI);
  }
  if (cpu_isa == avx512_mic) {
    return cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) &&
           cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF);
  }
  if (cpu_isa == avx512_mic_4ops) {
    // Builds on the avx512_mic requirements, then adds the 4-op extensions.
    return MayIUse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) &&
           cpu.has(Cpu::tAVX512_4VNNIW);
  }
  return false;
}
} // namespace jit
#endif
} // namespace platform
} // namespace paddle
......@@ -37,5 +37,25 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
#ifdef PADDLE_WITH_XBYAK
namespace jit {

// Instruction set architectures that can be queried through MayIUse().
typedef enum {
  isa_any,
  sse42,
  avx2,
  avx512_common,
  avx512_core,
  avx512_core_vnni,
  avx512_mic,
  avx512_mic_4ops,
} cpu_isa_t;  // Instruction set architecture

// May I use some instruction
// Not declared `inline`: the function is defined out of line in the .cc
// file, and an inline function must be defined in every translation unit
// that uses it, otherwise callers can fail to link.
bool MayIUse(const cpu_isa_t cpu_isa);

}  // namespace jit
#endif
} // namespace platform
} // namespace paddle
......@@ -189,6 +189,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
}
} // namespace
#endif // PADDLE_WITH_CUPTI
class DeviceTracerImpl : public DeviceTracer {
public:
DeviceTracerImpl() : enabled_(false) {}
......@@ -244,6 +246,8 @@ class DeviceTracerImpl : public DeviceTracer {
if (enabled_) {
return;
}
#ifdef PADDLE_WITH_CUPTI
EnableActivity();
// Register callbacks for buffer requests and completed by CUPTI.
......@@ -262,6 +266,7 @@ class DeviceTracerImpl : public DeviceTracer {
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
#endif // PADDLE_WITH_CUPTI
enabled_ = true;
}
......@@ -313,16 +318,21 @@ class DeviceTracerImpl : public DeviceTracer {
}
// Stops tracing and sets enabled_ to false.
// When PADDLE_WITH_CUPTI is defined it additionally force-flushes CUPTI
// activity, calls DisableActivity(), unsubscribes subscriber_ and stores the
// current CUPTI timestamp in end_ns_.
void Disable() {
#ifdef PADDLE_WITH_CUPTI
// flush might cause additional calls to DeviceTracker.
// NOTE(review): the flush is issued before trace_mu_ is locked below,
// presumably so those additional calls cannot block on the mutex - confirm.
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
#endif  // PADDLE_WITH_CUPTI
std::lock_guard<std::mutex> l(trace_mu_);
#ifdef PADDLE_WITH_CUPTI
DisableActivity();
dynload::cuptiUnsubscribe(subscriber_);
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
#endif  // PADDLE_WITH_CUPTI
enabled_ = false;
}
private:
#ifdef PADDLE_WITH_CUPTI
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) {
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
......@@ -340,7 +350,8 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
}
}
CUpti_SubscriberHandle subscriber_;
#endif // PADDLE_WITH_CUPTI
std::mutex trace_mu_;
bool enabled_;
uint64_t start_ns_;
......@@ -349,45 +360,9 @@ class DeviceTracerImpl : public DeviceTracer {
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
};
#endif // PADDLE_WITH_CUPTI
// No-op DeviceTracer: every Add*Records/AddAnnotation/Enable/Disable method
// has an empty body, IsEnabled() always returns false, and GenProfile()
// returns a default-constructed proto::Profile.
class DeviceTracerDummy : public DeviceTracer {
public:
DeviceTracerDummy() {}
// Recording hooks: all arguments are ignored.
void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) {}
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
int64_t stream_id, uint32_t correlation_id) {}
// The dummy tracer never reports itself as enabled.
bool IsEnabled() { return false; }
void Enable() {}
// profile_path is unused; an empty profile is returned.
proto::Profile GenProfile(const std::string &profile_path) {
return proto::Profile();
}
void Disable() {}
};
// Allocates a tracer with `new` and stores the pointer in *t:
// DeviceTracerImpl when PADDLE_WITH_CUPTI is defined, DeviceTracerDummy
// otherwise.
void CreateTracer(DeviceTracer **t) {
#ifdef PADDLE_WITH_CUPTI
*t = new DeviceTracerImpl();
#else
*t = new DeviceTracerDummy();
#endif  // PADDLE_WITH_CUPTI
}
// Allocates a DeviceTracerImpl with `new` and stores the pointer in *t.
void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
DeviceTracer *GetDeviceTracer() {
std::call_once(tracer_once_flag, CreateTracer, &tracer);
......
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <sys/time.h>
#include <time.h>
#include <chrono> // NOLINT
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
......@@ -25,6 +28,12 @@ namespace platform {
// WARN: Under Development. Don't depend on it yet.
//////////////////////
// Returns the gettimeofday() wall-clock time converted to nanoseconds.
// The clock has microsecond resolution, so the result is a multiple of 1000.
inline uint64_t PosixInNsec() {
  struct timeval now;
  gettimeofday(&now, nullptr);
  const uint64_t micros =
      static_cast<uint64_t>(now.tv_sec) * 1000000 + now.tv_usec;
  return 1000 * micros;
}
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
......
......@@ -223,7 +223,7 @@ class MKLDNNHandler {
static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT
const std::string& suffix) {
return dims2str(operand_dims) + suffix;
};
}
protected:
static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
......@@ -251,5 +251,17 @@ inline mkldnn::memory::format MKLDNNFormatForSize(
return data_format;
}
// Maps a data-layout string to an MKL-DNN memory format. The string is first
// parsed by framework::StringToDataLayout; kNHWC maps to nhwc, kNCHW maps to
// nchw, and any other layout value maps to format::any.
inline mkldnn::memory::format data_format_to_memory_format(
    const std::string& data_format) {
  const auto layout = framework::StringToDataLayout(data_format);
  if (layout == framework::DataLayout::kNHWC) {
    return mkldnn::memory::format::nhwc;
  }
  if (layout == framework::DataLayout::kNCHW) {
    return mkldnn::memory::format::nchw;
  }
  return mkldnn::memory::format::any;
}
} // namespace platform
} // namespace paddle
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <iomanip>
#include <limits>
......@@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() {
.count();
}
// Wall-clock time from gettimeofday(), scaled from microseconds to
// nanoseconds.
inline uint64_t PosixInNsec() {
  struct timeval stamp;
  gettimeofday(&stamp, nullptr);
  const uint64_t usec_total =
      static_cast<uint64_t>(stamp.tv_sec) * 1000000 + stamp.tv_usec;
  return 1000 * usec_total;
}
Event::Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx)
: type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
......@@ -277,12 +270,13 @@ struct EventItem {
double min_time;
double max_time;
double ave_time;
float ratio;
};
// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
const size_t data_width, double total) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
......@@ -307,7 +301,8 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
<< "Max." << std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j];
......@@ -316,7 +311,9 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
<< std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.total_time / total
<< std::endl;
}
}
std::cout << std::endl;
......@@ -366,6 +363,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
std::vector<std::vector<EventItem>> events_table;
size_t max_name_width = 0;
double total = 0.; // the total time
for (size_t i = 0; i < events.size(); i++) {
std::list<Event> pushed_events;
std::vector<EventItem> event_items;
......@@ -386,6 +384,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
g_state == ProfilerState::kAll)
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);
total += event_time;
std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
......@@ -394,7 +393,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time};
event_time, event_time, event_time,
0.};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
......@@ -438,7 +438,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
}
// Print report
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, total);
}
void DisableProfiler(EventSortingKey sorted_key,
......
......@@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) {
std::string ser(seriralized);
self.SetAttr(name, ser);
})
.def("block_attr", &pd::OpDesc::GetBlockAttr)
.def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
.def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
.def("check_attrs", &pd::OpDesc::CheckAttrs)
.def("infer_shape", &pd::OpDesc::InferShape)
.def("infer_var_type", &pd::OpDesc::InferVarType)
......
......@@ -394,8 +394,10 @@ All parameter, weight, gradient are variables in Paddle.
InferenceOptimize(*(origin.Proto()), &pruned_desc);
return new ProgramDesc(pruned_desc);
});
m.def("empty_var_name", []() { return framework::kEmptyVarName; });
m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
m.def("empty_var_name",
[]() { return std::string(framework::kEmptyVarName); });
m.def("grad_var_suffix",
[]() { return std::string(framework::kGradVarSuffix); });
m.def_submodule(
"var_names",
"The module will return special predefined variable name in Paddle")
......@@ -662,7 +664,7 @@ All parameter, weight, gradient are variables in Paddle.
const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t,
size_t>())
.def("bcast_params", &ParallelExecutor::BCastParamsToDevices)
.def("_bcast_params", &ParallelExecutor::BCastParamsToDevices)
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
......
......@@ -28,11 +28,12 @@ images per class.
"""
import cPickle
import itertools
import numpy
import paddle.dataset.common
import tarfile
from six.moves import zip
from six.moves import cPickle as pickle
__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
......@@ -48,7 +49,7 @@ def reader_creator(filename, sub_name, cycle=False):
data = batch['data']
labels = batch.get('labels', batch.get('fine_labels', None))
assert labels is not None
for sample, label in itertools.izip(data, labels):
for sample, label in zip(data, labels):
yield (sample / 255.0).astype(numpy.float32), int(label)
def reader():
......@@ -58,7 +59,7 @@ def reader_creator(filename, sub_name, cycle=False):
while True:
for name in names:
batch = cPickle.load(f.extractfile(name))
batch = pickle.load(f.extractfile(name))
for item in read_batch(batch):
yield item
if not cycle:
......
......@@ -20,9 +20,8 @@ import shutil
import sys
import importlib
import paddle.dataset
import cPickle
import six.moves.cPickle as pickle
import glob
import cPickle as pickle
__all__ = [
'DATA_HOME',
......@@ -75,13 +74,13 @@ def download(url, module_name, md5sum, save_name=None):
retry_limit = 3
while not (os.path.exists(filename) and md5file(filename) == md5sum):
if os.path.exists(filename):
print "file md5", md5file(filename), md5sum
print("file md5", md5file(filename), md5sum)
if retry < retry_limit:
retry += 1
else:
raise RuntimeError("Cannot download {0} within retry limit {1}".
format(url, retry_limit))
print "Cache file %s not found, downloading %s" % (filename, url)
print("Cache file %s not found, downloading %s" % (filename, url))
r = requests.get(url, stream=True)
total_length = r.headers.get('content-length')
......@@ -104,8 +103,9 @@ def download(url, module_name, md5sum, save_name=None):
def fetch_all():
for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.dataset)):
for module_name in [
x for x in dir(paddle.dataset) if not x.startswith("__")
]:
if "fetch" in dir(
importlib.import_module("paddle.dataset.%s" % module_name)):
getattr(
......@@ -114,8 +114,9 @@ def fetch_all():
def fetch_all_recordio(path):
for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.dataset)):
for module_name in [
x for x in dir(paddle.dataset) if not x.startswith("__")
]:
if "convert" in dir(
importlib.import_module("paddle.dataset.%s" % module_name)) and \
not module_name == "common":
......@@ -126,7 +127,7 @@ def fetch_all_recordio(path):
"convert")(ds_path)
def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
"""
you can call the function as:
......@@ -167,7 +168,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
def cluster_files_reader(files_pattern,
trainer_count,
trainer_id,
loader=cPickle.load):
loader=pickle.load):
"""
Create a reader that yield element from the given files, select
a file set according trainer count and trainer_id
......@@ -188,7 +189,7 @@ def cluster_files_reader(files_pattern,
my_file_list = []
for idx, fn in enumerate(file_list):
if idx % trainer_count == trainer_id:
print "append file: %s" % fn
print("append file: %s" % fn)
my_file_list.append(fn)
for fn in my_file_list:
with open(fn, "r") as f:
......@@ -221,7 +222,7 @@ def convert(output_path, reader, line_count, name_prefix):
for l in lines:
# FIXME(Yancey1989):
# dumps with protocol: pickle.HIGHEST_PROTOCOL
writer.write(cPickle.dumps(l))
writer.write(pickle.dumps(l))
writer.close()
lines = []
......
......@@ -24,18 +24,19 @@ import tarfile
import gzip
import itertools
import paddle.dataset.common
from six.moves import zip
# Public API of this module; every exported name is its own list entry so
# that `from <module> import *` can resolve each one.
__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
UNK_IDX = 0
......@@ -87,12 +88,12 @@ def corpus_reader(data_path, words_name, props_name):
sentences = []
labels = []
one_seg = []
for word, label in itertools.izip(words_file, props_file):
for word, label in zip(words_file, props_file):
word = word.strip()
label = label.strip().split()
if len(label) == 0: # end of sentence
for i in xrange(len(one_seg[0])):
for i in range(len(one_seg[0])):
a_kind_lable = [x[i] for x in one_seg]
labels.append(a_kind_lable)
......
......@@ -28,10 +28,9 @@ Graphics and Image Processing (2008)
http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
"""
import cPickle
import itertools
import functools
from common import download
from .common import download
import tarfile
import scipy.io as scio
from paddle.dataset.image import *
......@@ -39,6 +38,8 @@ from paddle.reader import *
import os
import numpy as np
from multiprocessing import cpu_count
from six.moves import cPickle as pickle
from six.moves import zip
__all__ = ['train', 'test', 'valid']
DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
......@@ -116,10 +117,10 @@ def reader_creator(data_file,
file = file.strip()
batch = None
with open(file, 'r') as f:
batch = cPickle.load(f)
batch = pickle.load(f)
data = batch['data']
labels = batch['label']
for sample, label in itertools.izip(data, batch['label']):
for sample, label in zip(data, batch['label']):
yield sample, int(label) - 1
if not cycle:
break
......
......@@ -36,7 +36,7 @@ except ImportError:
cv2 = None
import os
import tarfile
import cPickle
import six.moves.cPickle as pickle
__all__ = [
"load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
......@@ -86,10 +86,10 @@ def batch_images_from_tar(data_file,
output = {}
output['label'] = labels
output['data'] = data
cPickle.dump(
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
protocol=cPickle.HIGHEST_PROTOCOL)
protocol=pickle.HIGHEST_PROTOCOL)
file_id += 1
data = []
labels = []
......@@ -97,10 +97,10 @@ def batch_images_from_tar(data_file,
output = {}
output['label'] = labels
output['data'] = data
cPickle.dump(
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
protocol=cPickle.HIGHEST_PROTOCOL)
protocol=pickle.HIGHEST_PROTOCOL)
with open(meta_file, 'a') as meta:
for file in os.listdir(out_path):
......
......@@ -42,13 +42,13 @@ def tokenize(pattern):
# sequential access of member files, other than
# tarfile.extractfile, which does random access and might
# destroy hard disks.
tf = tarf.next()
tf = next(tarf)
while tf != None:
if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
None, string.punctuation).lower().split()
tf = tarf.next()
tf = next(tarf)
def build_dict(pattern, cutoff):
......@@ -62,11 +62,11 @@ def build_dict(pattern, cutoff):
word_freq[word] += 1
# Not sure if we should prune less-frequent words here.
word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff]
dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*dictionary))
word_idx = dict(zip(words, xrange(len(words))))
word_idx = dict(list(zip(words, list(range(len(words))))))
word_idx['<unk>'] = len(words)
return word_idx
......
......@@ -64,11 +64,11 @@ def build_dict(min_word_freq=50):
# remove <unk> for now, since we will set it as last index
del word_freq['<unk>']
word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(zip(words, xrange(len(words))))
word_idx = dict(list(zip(words, list(range(len(words))))))
word_idx['<unk>'] = len(words)
return word_idx
......
......@@ -65,7 +65,7 @@ def reader_creator(image_filename, label_filename, buffer_size):
images = images / 255.0 * 2.0 - 1.0
for i in xrange(buffer_size):
for i in range(buffer_size):
yield images[i, :], int(labels[i])
finally:
try:
......
......@@ -16,7 +16,7 @@ Movielens 1-M dataset.
Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
movies, which was collected by GroupLens Research. This module will download
Movielens 1-M dataset from
Movielens 1-M dataset from
http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
set and test set into paddle reader creators.
......@@ -187,7 +187,7 @@ def max_movie_id():
Get the maximum value of movie id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
return reduce(__max_index_info__, list(MOVIE_INFO.values())).index
def max_user_id():
......@@ -195,7 +195,7 @@ def max_user_id():
Get the maximum value of user id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, USER_INFO.viewvalues()).index
return reduce(__max_index_info__, list(USER_INFO.values())).index
def __max_job_id_impl__(a, b):
......@@ -210,7 +210,7 @@ def max_job_id():
Get the maximum value of job id.
"""
__initialize_meta_info__()
return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id
def movie_categories():
......@@ -243,7 +243,7 @@ def unittest():
for test_count, _ in enumerate(test()()):
pass
print train_count, test_count
print(train_count, test_count)
def fetch():
......
......@@ -26,7 +26,7 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
import os
import functools
import rarfile
from common import download
from .common import download
import numpy as np
# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
......@@ -53,7 +53,7 @@ class Query(object):
----------
query_id : int
query_id in dataset, mapping from query to relevance documents
relevance_score : int
relevance_score : int
relevance score of query and document pair
feature_vector : array, dense feature
feature in vector format
......@@ -92,7 +92,7 @@ class Query(object):
sys.stdout.write("expect 48 space split parts, get %d" %
(len(parts)))
return None
# format : 0 qid:10 1:0.000272 2:0.000000 ....
# format : 0 qid:10 1:0.000272 2:0.000000 ....
self.relevance_score = int(parts[0])
self.query_id = int(parts[1].split(':')[1])
for p in parts[2:]:
......@@ -295,7 +295,7 @@ def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
--------
filename : string
fill_missing : fill the missing value. default in MQ2007 is -1
Returns
------
yield
......@@ -330,4 +330,4 @@ if __name__ == "__main__":
mytest = functools.partial(
__reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
for label, query in mytest():
print label, query
print(label, query)
......@@ -43,11 +43,11 @@ def download_data_if_not_yet():
nltk.data.path.append(paddle.dataset.common.DATA_HOME)
movie_reviews.categories()
except LookupError:
print "Downloading movie_reviews data set, please wait....."
print("Downloading movie_reviews data set, please wait.....")
nltk.download(
'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
print "Download data set success....."
print "Path is " + nltk.data.find('corpora/movie_reviews').path
print("Download data set success.....")
print("Path is " + nltk.data.find('corpora/movie_reviews').path)
def get_word_dict():
......@@ -64,7 +64,7 @@ def get_word_dict():
for field in movie_reviews.fileids(category):
for words in movie_reviews.words(field):
word_freq_dict[words] += 1
words_sort_list = word_freq_dict.items()
words_sort_list = list(word_freq_dict.items())
words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
for index, word in enumerate(words_sort_list):
words_freq_sorted.append((word[0], index))
......@@ -80,7 +80,8 @@ def sort_files():
files_list = list()
neg_file_list = movie_reviews.fileids('neg')
pos_file_list = movie_reviews.fileids('pos')
files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
files_list = list(
chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
return files_list
......
......@@ -36,7 +36,7 @@ class TestCommon(unittest.TestCase):
def test_split(self):
def test_reader():
def reader():
for x in xrange(10):
for x in range(10):
yield x
return reader
......@@ -49,7 +49,7 @@ class TestCommon(unittest.TestCase):
def test_cluster_file_reader(self):
_, temp_path = tempfile.mkstemp()
for x in xrange(5):
for x in range(5):
with open(temp_path + '/%05d.test' % x) as f:
f.write('%d\n' % x)
reader = paddle.dataset.common.cluster_files_reader(
......@@ -63,7 +63,7 @@ class TestCommon(unittest.TestCase):
def test_reader():
def reader():
for x in xrange(record_num):
for x in range(record_num):
yield x
return reader
......
......@@ -59,7 +59,7 @@ class TestMikolov(unittest.TestCase):
self.assertEqual(first_line, read_line)
def test_total(self):
_, idx = zip(*WORD_DICT.items())
_, idx = list(zip(*list(WORD_DICT.items())))
self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
......
......@@ -24,9 +24,8 @@ from nltk.corpus import movie_reviews
class TestSentimentMethods(unittest.TestCase):
def test_get_word_dict(self):
word_dict = st.get_word_dict()[0:10]
test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
(u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
(u'is', 8), (u'in', 9)]
test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4),
('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)]
for idx, each in enumerate(word_dict):
self.assertEqual(each, test_word_list[idx])
self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
......
......@@ -49,9 +49,12 @@ def feature_range(maximums, minimums):
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
feature_num = len(maximums)
ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
ax.bar(list(range(feature_num)),
maximums - minimums,
color='r',
align='center')
ax.set_title('feature scale')
plt.xticks(range(feature_num), feature_names)
plt.xticks(list(range(feature_num)), feature_names)
plt.xlim([-1, feature_num])
fig.set_figheight(6)
fig.set_figwidth(10)
......@@ -71,7 +74,7 @@ def load_data(filename, feature_num=14, ratio=0.8):
maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
axis=0) / data.shape[0]
feature_range(maximums[:-1], minimums[:-1])
for i in xrange(feature_num - 1):
for i in range(feature_num - 1):
data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
offset = int(data.shape[0] * ratio)
UCI_TRAIN_DATA = data[:offset]
......
......@@ -36,11 +36,10 @@ URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
# This is a small subset of the data, used for testing. The original data is
# too large and will be added later.
URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
'wmt_shrinked_data/wmt14.tgz')
URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz')
MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
# BLEU of this trained model is 26.92
URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
START = "<s>"
......@@ -154,8 +153,8 @@ def get_dict(dict_size, reverse=True):
tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
if reverse:
src_dict = {v: k for k, v in src_dict.items()}
trg_dict = {v: k for k, v in trg_dict.items()}
src_dict = {v: k for k, v in list(src_dict.items())}
trg_dict = {v: k for k, v in list(trg_dict.items())}
return src_dict, trg_dict
......
......@@ -70,7 +70,9 @@ def __build_dict(tar_file, dict_size, save_path, lang):
fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
for idx, word in enumerate(
sorted(
word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
iter(list(word_dict.items())),
key=lambda x: x[1],
reverse=True)):
if idx + 3 == dict_size: break
fout.write("%s\n" % (word[0]))
......
......@@ -14,54 +14,52 @@
from __future__ import print_function
# import all class inside framework into fluid module
import framework
from framework import *
from . import framework
from .framework import *
# import all class inside executor into fluid module
import executor
from executor import *
import trainer
from trainer import Trainer
from trainer import BeginEpochEvent
from trainer import EndEpochEvent
from trainer import BeginStepEvent
from trainer import EndStepEvent
from trainer import CheckpointConfig
import inferencer
from inferencer import Inferencer
import io
import evaluator
import initializer
import layers
import contrib
import nets
import optimizer
import backward
import regularizer
import average
import metrics
import transpiler
from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from transpiler import DistributeTranspiler, InferenceTranspiler, \
from . import executor
from .executor import *
from . import trainer
from .trainer import Trainer
from .trainer import BeginEpochEvent
from .trainer import EndEpochEvent
from .trainer import BeginStepEvent
from .trainer import EndStepEvent
from .trainer import CheckpointConfig
from . import inferencer
from .inferencer import Inferencer
from . import io
from . import evaluator
from . import initializer
from . import layers
from . import contrib
from . import nets
from . import optimizer
from . import backward
from . import regularizer
from . import average
from . import metrics
from . import transpiler
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from .transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select)
from lod_tensor import create_lod_tensor, create_random_int_lodtensor
import clip
import profiler
import unique_name
import recordio_writer
import parallel_executor
from parallel_executor import *
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import profiler
from . import unique_name
from . import recordio_writer
from . import parallel_executor
from .parallel_executor import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
__all__ = framework.__all__ + executor.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + [
'io',
......@@ -99,8 +97,8 @@ def __bootstrap__():
None
"""
import sys
import core
import os
from . import core
in_test = 'unittest' in sys.modules
......@@ -128,7 +126,8 @@ def __bootstrap__():
]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
read_env_flags.append('listen_and_serv_profile_period')
read_env_flags.append('rpc_server_profile_period')
read_env_flags.append('rpc_server_profile_path')
if core.is_compiled_with_cuda():
read_env_flags += [
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import functools
import sys
......@@ -28,7 +29,7 @@ def deprecated(since, instead, extra_message=""):
@functools.wraps(func)
def wrapper(*args, **kwargs):
print >> sys.stderr, err_msg
print(err_msg, file=sys.stderr)
return func(*args, **kwargs)
wrapper.__doc__ += "\n "
......
......@@ -16,7 +16,8 @@ from paddle.fluid import framework as framework
from . import core
import collections
import copy
import unique_name
import six
from . import unique_name
__all__ = ['append_backward']
......@@ -44,17 +45,25 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
"""
op_desc = core.OpDesc()
op_desc.set_type(op_type)
for para, args in inputs.iteritems():
op_desc.set_input(para, args)
for para, args in outputs.iteritems():
op_desc.set_output(para, args)
for para, args in list(inputs.items()):
op_desc.set_input(
para,
list(
map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
args)))
for para, args in list(outputs.items()):
op_desc.set_output(
para,
list(
map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
args)))
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[
op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
for name, val in attrs.iteritems():
for name, val in list(attrs.items()):
if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc)
else:
......@@ -105,7 +114,9 @@ def _strip_grad_suffix_(name):
e.g. x@GRAD ==> x
y@GRAD@RENAME@1 ==> y
"""
pos = name.find(core.grad_var_suffix())
if isinstance(name, six.text_type):
name = name.encode()
pos = name.find(six.b(core.grad_var_suffix()))
return name[:pos] if pos != -1 else name
......@@ -114,7 +125,9 @@ def _append_grad_suffix_(name):
Append grad suffix to the given variable name
e.g. x ==> x@GRAD
"""
return name + core.grad_var_suffix()
if isinstance(name, six.text_type):
name = name.encode()
return name + six.b(core.grad_var_suffix())
def _addup_repetitive_outputs_(op_descs):
......@@ -174,7 +187,7 @@ def _addup_repetitive_outputs_(op_descs):
op_desc.set_output(param_name, arg_names)
renamed_vars[var_name].append(new_name)
for var_name, inputs in renamed_vars.iteritems():
for var_name, inputs in list(renamed_vars.items()):
if len(inputs) > 1:
pending_sum_ops.append(
(_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
......@@ -198,16 +211,19 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
out_arg_names = op_desc.output_arg_names()
if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
return True
if _all_in_set_(
filter(lambda name: name.find(core.grad_var_suffix()) != -1,
op_desc.input_arg_names()), no_grad_set):
if _all_in_set_([
name for name in op_desc.input_arg_names()
if name.find(core.grad_var_suffix()) != -1
], no_grad_set):
no_grad_set.update(out_arg_names)
return True
return False
# Remove ops whose outputs are all in no_grad_dict
op_descs = filter(
lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
op_descs = [
op_desc for op_desc in op_descs
if not _op_can_be_removed_(op_desc, no_grad_set)
]
# Insert fill_zeros_like_op
to_insert = []
for idx, op_desc in enumerate(op_descs):
......@@ -217,12 +233,12 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
"X": [_strip_grad_suffix_(arg)]
}, {"Out": [arg]}, {}), idx))
map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])
return op_descs
import proto.framework_pb2 as framework_pb2
from .proto import framework_pb2
def serialize_op_decs(op_desc):
......@@ -244,8 +260,10 @@ def _callback_lookup_(op):
if op.type == 'parallel_do' and op.attr('use_nccl'):
all_vars = op.block.vars
param_names = set(op.input('parameters'))
param_names = filter(lambda name: all_vars[name].stop_gradient is False,
param_names)
param_names = [
name for name in param_names
if all_vars[name].stop_gradient is False
]
param_grad_names = [n + "@GRAD" for n in param_names]
class ParallelDoCallBack(object):
......@@ -326,7 +344,7 @@ def _append_backward_ops_(block,
grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block"))
sub_block = program.block(op.block_attr_id("sub_block"))
grad_sub_block = program.create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op)
......@@ -388,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr("sub_block"))
sub_block = block.program.block(op_desc.block_attr_id("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set()
# create new gradient variables
......@@ -399,7 +417,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
continue
block.desc.var(grad_var_name)
new_vars.add(grad_var_name)
if not grad_to_var.has_key(grad_var_name):
if grad_var_name not in grad_to_var:
continue
grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
# infer_shape and infer_type
......@@ -427,7 +445,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
op_desc.rename_output(name, new_name)
var_map[name] = new_name
for g, ng in var_map.iteritems():
for g, ng in list(var_map.items()):
if g in grad_to_var:
grad_to_var[ng] = grad_to_var[g]
grad_to_var.pop(g)
......@@ -439,7 +457,7 @@ def _get_stop_gradients_(program):
for block in program.blocks:
assert isinstance(block, framework.Block)
block_no_grad_set = set()
for var in block.vars.itervalues():
for var in list(block.vars.values()):
assert isinstance(var, framework.Variable)
if var.stop_gradient:
block_no_grad_set.add(_append_grad_suffix_(var.name))
......@@ -452,51 +470,51 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
"""
Append backward part to main_program.
A complete neural network training is made up of forward and backward
propagation. However, when we configure a network, we only need to
specify its forward part. The backward part is generated automatically
A complete neural network training is made up of forward and backward
propagation. However, when we configure a network, we only need to
specify its forward part. The backward part is generated automatically
according to the forward part by this function.
In most cases, users do not need to invoke this function manually. It
In most cases, users do not need to invoke this function manually. It
will be automatically invoked by the optimizer's `minimize` function.
Args:
loss(Variable): The loss variable of the network.
parameter_list(list[string]|None): Names of parameters that need
to be updated by optimizers.
If it is None, all parameters
parameter_list(list[string]|None): Names of parameters that need
to be updated by optimizers.
If it is None, all parameters
will be updated.
Default: None
no_grad_set(set|None): Variables in the Block 0 whose gradients
should be ignored. All variables with
`stop_gradient=True` from all blocks will
no_grad_set(set|None): Variables in the Block 0 whose gradients
should be ignored. All variables with
`stop_gradient=True` from all blocks will
be automatically added into this set.
Default: None
callbacks(list[callable object]|None): The callbacks are used for
doing some custom jobs during
backward part building. All
callable objects in it will
be invoked once each time a
new gradient operator is added
into the program. The callable
object must have two input
parameters: 'block' and 'context'.
The 'block' is the block which
the new gradient operator will
be added to. The 'context' is a
map, whose keys are gradient
variable names and values are
callbacks(list[callable object]|None): The callbacks are used for
doing some custom jobs during
backward part building. All
callable objects in it will
be invoked once each time a
new gradient operator is added
into the program. The callable
object must have two input
parameters: 'block' and 'context'.
The 'block' is the block which
the new gradient operator will
be added to. The 'context' is a
map, whose keys are gradient
variable names and values are
corresponding original variables.
In addition to this, the 'context'
has another special key-value pair:
the key is string '__current_op_desc__'
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
In addition to this, the 'context'
has another special key-value pair:
the key is string '__current_op_desc__'
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
Returns:
list[(Variable,Variable)]: Pairs of parameter and its
corresponding gradients. The key is the parameter and the
list[(Variable,Variable)]: Pairs of parameter and its
corresponding gradients. The key is the parameter and the
value is gradient variable.
Raises:
......@@ -535,7 +553,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
no_grad_set = set()
no_grad_set = copy.copy(no_grad_set)
no_grad_dict = _get_stop_gradients_(program)
no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
grad_info_map = dict()
root_block = program.block(0)
......@@ -558,7 +576,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
_append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
grad_to_var, callbacks)
......@@ -697,7 +715,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
no_grad_set = set()
no_grad_set = copy.copy(no_grad_set)
no_grad_dict = _get_stop_gradients_(prog)
no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
fwd_op_num = block.desc.op_size()
......@@ -731,7 +749,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
grad_to_var = dict()
grad_info_map = dict()
_append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
......
......@@ -13,10 +13,11 @@
# limitations under the License.
import copy
import six
import functools
import layers
import framework
from . import layers
from . import framework
from . import core
__all__ = [
......@@ -80,8 +81,7 @@ def error_clip_callback(block, context):
# the context is a grad_to_var map
grad_to_var = context
op_desc = block.desc.op(block.desc.op_size() - 1)
for grad_n in filter(lambda n: grad_to_var.has_key(n),
op_desc.output_arg_names()):
for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
fwd_var = block._var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if not (error_clip is None or isinstance(error_clip,
......@@ -247,8 +247,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
"""
def __init__(self, clip_norm, group_name="default_group"):
if not isinstance(group_name, basestring):
raise TypeError("'group_name' must be a basestring.")
if not isinstance(group_name, six.string_types):
raise TypeError("'group_name' must be a %s." % (six.string_types))
self.clip_norm = clip_norm
self.group_name = group_name
......@@ -284,7 +284,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
x=clip_var,
y=layers.elementwise_max(
x=clip_var, y=group_norm_var))
assert group_scale_var.shape == (1L, )
assert group_scale_var.shape == (1, )
self.context[group_scale_name] = group_scale_var
new_grad = layers.elementwise_mul(
......@@ -313,7 +313,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
program = framework.default_main_program()
if param_list is None:
param_list = program.block(0).all_parameters()
if all(isinstance(elem, basestring) for elem in param_list):
if all(isinstance(elem, six.string_types) for elem in param_list):
param_list = [program.block(0).var(elem) for elem in param_list]
if not all(isinstance(elem, framework.Parameter) for elem in param_list):
raise TypeError(
......
......@@ -12,15 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from layers.control_flow import BlockGuard, equal
from .layers.control_flow import BlockGuard, equal
from .framework import Operator
from layer_helper import LayerHelper, unique_name
from layers import fill_constant
import core
from .layer_helper import LayerHelper, unique_name
from .layers import fill_constant
from . import core
__all__ = [
'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close',
'Select'
'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select'
]
......@@ -35,10 +34,10 @@ class Go(BlockGuard):
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
self.construct_go_op()
self._construct_go_op()
return super(Go, self).__exit__(exc_type, exc_val, exc_tb)
def construct_go_op(self):
def _construct_go_op(self):
main_program = self.helper.main_program
go_block = main_program.current_block()
parent_block = main_program.block(main_program.current_block()
......
......@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import decoder
from decoder import *
import memory_usage_calc
from memory_usage_calc import *
from . import decoder
from .decoder import *
from . import memory_usage_calc
from .memory_usage_calc import *
__all__ = decoder.__all__ + memory_usage_calc.__all__
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import beam_search_decoder
from beam_search_decoder import *
from . import beam_search_decoder
from .beam_search_decoder import *
__all__ = beam_search_decoder.__all__
......@@ -22,6 +22,7 @@ This API is still under active development and may change drastically.
import contextlib
import numpy as np
import six
from ... import layers
from ...framework import Variable
......@@ -191,7 +192,7 @@ class StateCell(object):
self._helper = LayerHelper('state_cell', name=name)
self._cur_states = {}
self._state_names = []
for state_name, state in states.items():
for state_name, state in six.iteritems(states):
if not isinstance(state, InitState):
raise ValueError('state must be an InitState object.')
self._cur_states[state_name] = state
......@@ -346,7 +347,7 @@ class StateCell(object):
if self._in_decoder and not self._switched_decoder:
self._switch_decoder()
for input_name, input_value in inputs.items():
for input_name, input_value in six.iteritems(inputs):
if input_name not in self._inputs:
raise ValueError('Unknown input %s. '
'Please make sure %s in input '
......@@ -361,7 +362,7 @@ class StateCell(object):
if self._in_decoder and not self._switched_decoder:
self._switched_decoder()
for state_name, decoder_state in self._states_holder.items():
for state_name, decoder_state in six.iteritems(self._states_holder):
if id(self._cur_decoder_obj) not in decoder_state:
raise ValueError('Unknown decoder object, please make sure '
'switch_decoder been invoked.')
......@@ -671,7 +672,7 @@ class BeamSearchDecoder(object):
feed_dict = {}
update_dict = {}
for init_var_name, init_var in self._input_var_dict.items():
for init_var_name, init_var in six.iteritems(self._input_var_dict):
if init_var_name not in self.state_cell._inputs:
raise ValueError('Variable ' + init_var_name +
' not found in StateCell!\n')
......@@ -721,7 +722,8 @@ class BeamSearchDecoder(object):
self.state_cell.update_states()
self.update_array(prev_ids, selected_ids)
self.update_array(prev_scores, selected_scores)
for update_name, var_to_update in update_dict.items():
for update_name, var_to_update in six.iteritems(
update_dict):
self.update_array(var_to_update, feed_dict[update_name])
def read_array(self, init, is_ids=False, is_scores=False):
......
......@@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import core
from . import core
import numpy
import os
import six.moves as six
import six
from six.moves import zip, range, xrange
import multiprocessing
from framework import Variable, default_main_program
from .framework import Variable, default_main_program
__all__ = ['DataFeeder']
......@@ -53,7 +53,7 @@ class DataToLoDTensorConverter(object):
self.data = []
self.lod = []
for i in six.range(lod_level):
for i in six.moves.range(lod_level):
self.lod.append([])
def feed(self, data):
......@@ -142,7 +142,7 @@ class DataFeeder(object):
if program is None:
program = default_main_program()
for each_var in feed_list:
if isinstance(each_var, basestring):
if isinstance(each_var, six.string_types):
each_var = program.block(0).var(each_var)
if not isinstance(each_var, Variable):
raise TypeError("Feed list should contain a list of variable")
......@@ -174,7 +174,7 @@ class DataFeeder(object):
dict: the result of conversion.
"""
converter = []
for lod_level, shape, dtype in six.zip(
for lod_level, shape, dtype in six.moves.zip(
self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
converter.append(
DataToLoDTensorConverter(
......@@ -187,10 +187,12 @@ class DataFeeder(object):
assert len(each_sample) == len(converter), (
"The number of fields in data (%s) does not match " +
"len(feed_list) (%s)") % (len(each_sample), len(converter))
for each_converter, each_slot in six.zip(converter, each_sample):
for each_converter, each_slot in six.moves.zip(converter,
each_sample):
each_converter.feed(each_slot)
ret_dict = {}
for each_name, each_converter in six.zip(self.feed_names, converter):
for each_name, each_converter in six.moves.zip(self.feed_names,
converter):
ret_dict[each_name] = each_converter.done()
return ret_dict
......@@ -212,12 +214,14 @@ class DataFeeder(object):
if isinstance(self.place, core.CUDAPlace):
places = [
core.CUDAPlace(i)
for i in six.xrange(self._get_number_of_places_(num_places))
for i in six.moves.xrange(
self._get_number_of_places_(num_places))
]
else:
places = [
core.CPUPlace()
for _ in six.xrange(self._get_number_of_places_(num_places))
for _ in six.moves.xrange(
self._get_number_of_places_(num_places))
]
if len(iterable) != len(places):
......@@ -227,7 +231,7 @@ class DataFeeder(object):
"must be same.")
place = self.place
for p, batch in six.zip(places, iterable):
for p, batch in six.moves.zip(places, iterable):
self.place = p
yield self.feed(batch)
self.place = place
......
......@@ -14,8 +14,8 @@
import sys
import re
from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2
from .graphviz import GraphPreviewGenerator
from .proto import framework_pb2
from google.protobuf import text_format
_vartype2str_ = [
......
......@@ -15,11 +15,11 @@
import warnings
import numpy as np
import layers
from framework import Program, Variable, program_guard
import unique_name
from layer_helper import LayerHelper
from initializer import Constant
from . import layers
from .framework import Program, Variable, program_guard
from . import unique_name
from .layer_helper import LayerHelper
from .initializer import Constant
__all__ = [
'ChunkEvaluator',
......
......@@ -14,12 +14,11 @@
import numpy as np
import contextlib
from framework import Program, default_main_program, Variable
import six
from .framework import Program, default_main_program, Variable
from . import core
__all__ = [
'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var'
]
__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope']
g_scope = core.Scope()
......@@ -170,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
return fetch_count > 0
def fetch_var(name, scope=None, return_numpy=True):
def _fetch_var(name, scope=None, return_numpy=True):
"""
Fetch the value of the variable with the given name from the
given scope.
......@@ -204,23 +203,54 @@ def fetch_var(name, scope=None, return_numpy=True):
def _get_program_cache_key(feed, fetch_list):
feed_var_names = feed.keys()
feed_var_names = list(feed.keys())
def to_name_str(var):
if isinstance(var, Variable):
return var.desc.name()
elif isinstance(var, str):
return var
elif isinstance(var, basestring):
elif isinstance(var, six.string_types):
return str(var)
else:
raise TypeError(str(var) + " should be Variable or str")
fetch_var_names = map(to_name_str, fetch_list)
fetch_var_names = list(map(to_name_str, fetch_list))
return str(feed_var_names + fetch_var_names)
def _as_lodtensor(data, place):
"""
Convert numpy.ndarray to Tensor, it only supports Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> place = fluid.CPUPlace()
>>> exe = fluid.executor(place)
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data)
>>> ...
Args:
data(numpy.ndarray): an instance of array
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
ndarray to LoDTensor. Please convert data to LoDTensor \
directly before feeding the data.\
")
# single tensor case
tensor = core.LoDTensor()
tensor.set(data, place)
return tensor
class Executor(object):
"""
An Executor in Python, only supports single-GPU running. For multi-cards, please refer to
......@@ -229,8 +259,8 @@ class Executor(object):
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list.
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
......@@ -249,35 +279,6 @@ class Executor(object):
self.program_caches = dict()
self._closed = False
def as_lodtensor(self, data):
"""
Convert numpy.ndarray to Tensor, it only supports Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> exe = fluid.executor(fluid.CPUPlace())
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: exe.as_lodtensor(x), data)
>>> ...
Args:
data(numpy.ndarray): an instance of array
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
ndarray to LoDTensor. Please convert data to LoDTensor \
directly before feeding the data.\
")
# single tensor case
tensor = core.LoDTensor()
tensor.set(data, self.place)
return tensor
def _get_program_cache(self, program_cache_key):
return self.program_caches.get(program_cache_key, None)
......@@ -336,7 +337,7 @@ class Executor(object):
feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name]
if not isinstance(cur_feed, core.LoDTensor):
cur_feed = self.as_lodtensor(cur_feed)
cur_feed = _as_lodtensor(cur_feed, self.place)
idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else:
......@@ -345,7 +346,7 @@ class Executor(object):
def _fetch_data(self, fetch_list, fetch_var_name, scope):
outs = [
core.get_fetch_variable(scope, fetch_var_name, i)
for i in xrange(len(fetch_list))
for i in range(len(fetch_list))
]
return outs
......
此差异已折叠。
......@@ -14,12 +14,13 @@
import os
import random
import six
import subprocess
import logging
def crepr(v):
if type(v) is str or type(v) is unicode:
if isinstance(v, six.string_types):
return '"%s"' % v
return str(v)
......@@ -104,7 +105,7 @@ class Graph(object):
def _rank_repr(self):
ranks = sorted(
self.rank_groups.items(),
list(self.rank_groups.items()),
cmp=lambda a, b: a[1].priority > b[1].priority)
repr = []
for x in ranks:
......@@ -148,7 +149,7 @@ class Node(object):
name=self.name,
label=self.label,
extra=',' + ','.join("%s=%s" % (key, crepr(value))
for key, value in self.attrs.items())
for key, value in list(self.attrs.items()))
if self.attrs else "")
return reprs
......@@ -172,7 +173,7 @@ class Edge(object):
target=self.target.name,
extra="" if not self.attrs else
"[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
for attr in self.attrs.items()) + "]")
for attr in list(self.attrs.items())) + "]")
return repr
......
......@@ -14,14 +14,14 @@
import contextlib
import core
import executor
import framework
import io
import parallel_executor
import unique_name
from trainer import check_and_get_place
from . import core
from . import executor
from . import framework
from . import io
from . import parallel_executor
from . import unique_name
from .trainer import check_and_get_place
__all__ = ['Inferencer', ]
......
......@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import framework
from . import framework
import numpy as np
import contextlib
from framework import convert_np_dtype_to_dtype_
from core import VarDesc
from .framework import convert_np_dtype_to_dtype_
from .core import VarDesc
__all__ = [
'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',
......@@ -264,7 +264,8 @@ class NormalInitializer(Initializer):
"dtype": int(var.dtype),
"mean": self._mean,
"std": self._std_dev,
"seed": self._seed
"seed": self._seed,
"use_mkldnn": False
})
var.op = op
return op
......
此差异已折叠。
......@@ -14,12 +14,14 @@
import copy
import itertools
import six
from framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
import unique_name
from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
from . import unique_name
from paddle.fluid.initializer import Constant, Xavier
from param_attr import ParamAttr, WeightNormParamAttr
import core
from .param_attr import ParamAttr, WeightNormParamAttr
from . import core
from six.moves import zip
class LayerHelper(object):
......@@ -83,7 +85,7 @@ class LayerHelper(object):
raise ValueError("parameter number mismatch")
elif len(param_attr) == 1 and length != 1:
tmp = [None] * length
for i in xrange(length):
for i in range(length):
tmp[i] = copy.deepcopy(param_attr[0])
param_attr = tmp
return param_attr
......@@ -91,7 +93,7 @@ class LayerHelper(object):
def iter_inputs_and_params(self, input_param_name='input'):
inputs = self.multiple_input(input_param_name)
param_attrs = self.multiple_param_attr(len(inputs))
for ipt, param_attr in itertools.izip(inputs, param_attrs):
for ipt, param_attr in zip(inputs, param_attrs):
yield ipt, param_attr
def input_dtype(self, input_param_name='input'):
......@@ -218,7 +220,7 @@ class LayerHelper(object):
norm = __norm_op(reshape, dim=0, block=block)
__reshape_op(norm, out=out, shape=out_shape, block=block)
else:
perm = range(len(x.shape))
perm = list(range(len(x.shape)))
perm[0], perm[dim] = dim, 0
transpose = __transpose_op(x, perm, block=block)
norm = __norm_op(transpose, dim=0, block=block)
......@@ -397,8 +399,10 @@ class LayerHelper(object):
act = self.kwargs.get('act', None)
if act is None:
return input_var
if isinstance(act, basestring):
if isinstance(act, six.string_types):
act = {'type': act}
else:
raise TypeError(str(act) + " should be unicode or str")
if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'):
act['use_cudnn'] = self.kwargs.get('use_cudnn')
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册