diff --git a/CMakeLists.txt b/CMakeLists.txt index ed704585d8a6bf3befd9a549aa5a62a33fea3da9..291a960b1471b22a6cb53c4ca49b45609afb4dc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) +option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -103,6 +104,8 @@ if(ANDROID OR IOS) "Disable RDMA when cross-compiling for Android and iOS" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_NGRAPH OFF CACHE STRING + "Disable nGraph when cross-compiling for Android and iOS" FORCE) set(WITH_GOLANG OFF CACHE STRING "Disable golang when cross-compiling for Android and iOS" FORCE) @@ -171,6 +174,7 @@ include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn +include(external/ngraph) # download, build, install nGraph include(external/swig) # download, build, install swig include(external/boost) # download boost include(external/any) # download libn::any diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 9fea9ca05bce921b105a0f092c321b0c3a55c63c..785148d4f9f44032e2ce5bf93f0dc80fc865808b 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") 
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. -INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2e335579f32df4f146c8d88e05e684a9a8105e20 --- /dev/null +++ b/cmake/external/ngraph.cmake @@ -0,0 +1,92 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(ngraph INTERFACE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with nGraph in Paddle yet." + "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE) +ENDIF() + +IF(${WITH_NGRAPH} AND NOT ${WITH_MKLDNN}) + MESSAGE(WARNING + "nGraph needs mkl-dnn to be enabled." 
+ "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE) +ENDIF() + +IF(NOT ${WITH_NGRAPH}) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(NGRAPH_PROJECT "extern_ngraph") +SET(NGRAPH_VERSION "0.9") +SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") +SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) +SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) +SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) +SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) +SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) +SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") + +ExternalProject_Add( + ${NGRAPH_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} + GIT_REPOSITORY ${NGRAPH_GIT_REPO} + GIT_TAG ${NGRAPH_GIT_TAG} + PREFIX ${NGRAPH_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} + CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} + CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib +) + +if(UNIX AND NOT APPLE) + include(GNUInstallDirs) + SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) +else() + SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib) +endif() +MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}") + +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) + +# Workaround for nGraph expecting mklml to be in mkldnn install directory. 
+ExternalProject_Add_Step( + ${NGRAPH_PROJECT} + PrepareMKL + COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so + COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so + DEPENDEES download + DEPENDERS configure +) + +add_dependencies(ngraph ${NGRAPH_PROJECT}) +target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH) +target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR}) +target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB}) +LIST(APPEND external_project_dependencies ngraph) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 550b0dada8e90c1e2b33705fd53c065672113b45..45ef9b4550291cadaa9571f05dbaefdf4a0c223a 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -30,66 +30,61 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY) UNSET_VAR(PROTOBUF_LIBRARY) UNSET_VAR(PROTOBUF_INCLUDE_DIR) UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) +function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() -if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. 
- function(protobuf_generate_python SRCS) - # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") - return() - endif() - - if(PROTOBUF_GENERATE_CPP_APPEND_PATH) - # Create an include path for each file specified - foreach(FIL ${ARGN}) - get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(ABS_PATH ${ABS_FIL} PATH) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - else() - set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) - endif() - - if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) - set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") - endif() - - if(DEFINED Protobuf_IMPORT_DIRS) - foreach(DIR ${Protobuf_IMPORT_DIRS}) - get_filename_component(ABS_PATH ${DIR} ABSOLUTE) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - endif() - - set(${SRCS}) + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified foreach(FIL ${ARGN}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(FIL_WE ${FIL} NAME_WE) - if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) - get_filename_component(FIL_DIR ${FIL} DIRECTORY) - if(FIL_DIR) - set(FIL_WE "${FIL_DIR}/${FIL_WE}") - endif() + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS 
"${PROTOBUF_IMPORT_DIRS}") + endif() - list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" - COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} - DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} - COMMENT "Running Python protocol buffer compiler on ${FIL}" - VERBATIM ) + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() endforeach() + endif() - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - endfunction() -endif() + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) +endfunction() # Print and set the protobuf library information, # finish this cmake process and exit from this file. @@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB) # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # make `protobuf_generate_cpp` happy. 
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + FOREACH(dep ${protobuf_DEPS}) ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep}) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 88a2c740e08f7c2c6ac831c65a0ef992064b3a61..de32a5d5a297b63a80aa41fc99fcacf60bbf2488 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -118,9 +118,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) -paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR')) +paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None)) paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) -paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.gather 
ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -178,10 +179,12 @@ paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], vara paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) +paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 
'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -200,6 +203,7 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index fee6ba40047053ed5662fe044eceb0c687bd4db9..57ff061fe5e612495add86df8f82fe7d9f9107dc 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -18,8 +18,8 @@ namespace framework { void TransDataDevice(const Tensor &in, const platform::Place &dst_place, Tensor *out) { - VLOG(3) << "DeviceTransform in, src_place " << in.place() - << " dst_place: " << dst_place; + VLOG(30) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; PADDLE_ENFORCE_NE( in.place().which(), dst_place.which(), diff --git a/paddle/fluid/framework/data_device_transform_test.cu 
b/paddle/fluid/framework/data_device_transform_test.cu index f2c55e533a2747325b1b16fdada37945a8ed3c42..21e0cb3f91cc0ae05513c3bbd470650ca71194d7 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel { OpKernelType GetExpectedKernelType( const ExecutionContext& ctx) const override { if (Attr("use_gpu")) { - VLOG(3) << "force use gpu kernel"; + VLOG(30) << "force use gpu kernel"; return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0)); } else { - VLOG(3) << "use default kernel"; + VLOG(30) << "use default kernel"; return OpKernelType(proto::VarType::FP32, ctx.Input("input")->place()); } @@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) { // get output auto* output2 = scope.Var("OUT2"); gpu_op->Run(scope, cuda_place); - VLOG(3) << "after gpu_op run"; + VLOG(30) << "after gpu_op run"; // auto* output2_ptr = output2->Get().data(); paddle::platform::DeviceContextPool& pool = diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 7f0d06c892541a2697a4ed083f6f4c0fc774a2a4..8e5e5427659387d63eac21a200c1a20da493e539 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar( PADDLE_ENFORCE_NOT_NULL(in_var); Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); if (UNLIKELY(!in_tensor.IsInitialized())) { - VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!"; + VLOG(30) << "in var " << in_var_handle.name_ << "not inited, return!"; return; } diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc index 67aad9f94f088f4b50e1ce2728d83de98a3c60ad..bf3f3637b551a8a8084e6e4f1ca6a94b65361f17 100644 --- 
a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc +++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc @@ -45,8 +45,8 @@ std::unique_ptr ModifyOpLockAndRecordEventPass::ApplyImpl( IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view); compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free); if (is_lock_and_record_event_free) { - VLOG(10) << "Set is_lock_and_record_event_free be true in op " - << compute_op->DebugString(); + VLOG(100) << "Set is_lock_and_record_event_free be true in op " + << compute_op->DebugString(); } } return ir_graph; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 3dc177a8cb7a1e994aca5304240f1eb61ba23f02..8c98b781301e884d5d5c7d141f3d901d74d51285 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -399,7 +399,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( for (size_t i = 0; i < backward_vars.size(); i += 2) { auto &p_name = backward_vars[i]; auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + VLOG(100) << "Bcast " << g_name << " for parameter " << p_name; switch (strategy_.reduce_) { case BuildStrategy::ReduceStrategy::kReduce: @@ -809,8 +809,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U); op_dev_id = GetAppropriateDeviceID({send_param_grad[1]}); - VLOG(10) << "send grad " << input_var_names[0] << " origin " - << send_param_grad[1] << " place: " << op_dev_id; + VLOG(100) << "send grad " << input_var_names[0] << " origin " + << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { sharded_var_device->emplace(varname, op_dev_id); } @@ -826,9 +826,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( 
if (recv_param_grad.size() == 2U) { op_dev_id = GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); - VLOG(10) << "recv param " << recv_param_grad[0] - << " get grad place: " << recv_param_grad[1] - << " place: " << op_dev_id; + VLOG(100) << "recv param " << recv_param_grad[0] + << " get grad place: " << recv_param_grad[1] + << " place: " << op_dev_id; } else { op_dev_id = GetAppropriateDeviceID(output_var_names); } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 08783fb5f8b18329c9167edb0dac39b7dd42a746..28443cc886e4c3f5db707d6d8fe9971618d8c2f7 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -140,8 +140,8 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( if (next_compute_op != nullptr) { if (compute_ref_cnt_map.count(next_compute_op)) { compute_ref_cnt_map[next_compute_op]->AddVar(var_name); - VLOG(5) << "Add reference count of " << var_name << " to Operator " - << next_compute_op->Name(); + VLOG(50) << "Add reference count of " << var_name << " to Operator " + << next_compute_op->Name(); } else { // Create new reference_count_op_handle ir::Node *ref_cnt_node = graph->CreateEmptyNode( diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index ef1626599795a553e654fe5d3ed74ef3a3a67d78..6ab6cb2332b0af3fa16b986f115513ee098fae4f 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() { ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(10) << place_ << "RUN Scale loss grad op"; + VLOG(100) << place_ << "RUN Scale loss grad op"; }); #endif } diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc 
b/paddle/fluid/framework/details/sequential_execution_pass.cc index cc2c8bfef9f9f54c2e499467df0d22ce3f69d6b8..f78a47bb78e6f1d81db6abed11a7762f21dd2226 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -94,8 +94,8 @@ std::unique_ptr SequentialExecutionPass::ApplyImpl( op_node_list[i - 1]->outputs.push_back(dep_var); dep_var->outputs.push_back(op_node_list[i]); dep_var->inputs.push_back(op_node_list[i - 1]); - VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name() - << " and " << op_node_list[i]->Name(); + VLOG(100) << "Add dependencies between " << op_node_list[i - 1]->Name() + << " and " << op_node_list[i]->Name(); } return graph; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 677a2937945b03fa577317cb4f26e09354d06957..f781f02a076594b5a70fd4863ebf273e88607dfd 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -210,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp( details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { try { - if (VLOG_IS_ON(10)) { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); + if (VLOG_IS_ON(100)) { + VLOG(100) << op << " " << op->Name() << " : " << op->DebugString(); } if (LIKELY(!strategy_.dry_run_)) { op->Run(strategy_.use_cuda_); } - VLOG(10) << op << " " << op->Name() << " Done "; + VLOG(100) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); - VLOG(10) << op << " " << op->Name() << "Signal posted"; + VLOG(100) << op << " " << op->Name() << "Signal posted"; } catch (...) 
{ exception_holder_.Catch(std::current_exception()); } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8ed0ba1dfa68b3e22f370c3f2dd0f83c3e5506b0..fc6b32528661fb56b39d007465046ac6fb893046 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -43,7 +43,7 @@ ExecutorPrepareContext::ExecutorPrepareContext( } ExecutorPrepareContext::~ExecutorPrepareContext() { - VLOG(5) << "destroy ExecutorPrepareContext"; + VLOG(50) << "destroy ExecutorPrepareContext"; } template @@ -60,7 +60,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, if ((it->second)-- == 1) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(100) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { @@ -141,21 +141,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, if (var->Persistable()) { auto* ptr = const_cast(ancestor_scope)->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; + VLOG(30) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; } else { auto* ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + VLOG(30) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; } } } else { for (auto& var : global_block.AllVars()) { auto* ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; + VLOG(30) << "Create variable " << var->Name() << ", which pointer is " + << ptr; } } } @@ -286,7 +286,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, int i = 0; for (auto& feed_target : 
(*feed_targets)) { std::string var_name = feed_target.first; - VLOG(3) << "feed target's name: " << var_name; + VLOG(30) << "feed target's name: " << var_name; // prepend feed op auto* op = global_block->PrependOp(); @@ -309,7 +309,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, int i = 0; for (auto& fetch_target : (*fetch_targets)) { std::string var_name = fetch_target.first; - VLOG(3) << "fetch target's name: " << var_name; + VLOG(30) << "fetch target's name: " << var_name; // append fetch op auto* op = global_block->AppendOp(); @@ -398,8 +398,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } if (FLAGS_benchmark) { - VLOG(2) << "Memory used after operator " + op->Type() + " running: " - << memory::memory_usage(place_); + VLOG(20) << "Memory used after operator " + op->Type() + " running: " + << memory::memory_usage(place_); } } @@ -424,10 +424,10 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } if (FLAGS_benchmark) { - VLOG(2) << "-------------------------------------------------------"; - VLOG(2) << "Memory used after deleting local scope: " - << memory::memory_usage(place_); - VLOG(2) << "-------------------------------------------------------"; + VLOG(20) << "-------------------------------------------------------"; + VLOG(20) << "Memory used after deleting local scope: " + << memory::memory_usage(place_); + VLOG(20) << "-------------------------------------------------------"; } } @@ -471,7 +471,7 @@ void Executor::RunPreparedContext( void Executor::EnableMKLDNN(const ProgramDesc& program) { #ifdef PADDLE_WITH_MKLDNN - VLOG(3) << "use_mkldnn=True"; + VLOG(30) << "use_mkldnn=True"; for (size_t bid = 0; bid < program.Size(); ++bid) { auto* block = const_cast(program).MutableBlock(bid); for (auto* op : block->AllOps()) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 
3e9353f5cf67d8de62c5551f12ea786e49190549..1f3c19c0d5901cec9acc4ac9c5dab538d620c956 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, const std::string& var_name, size_t index) { // If var_name Variable is not found in GlobalScope, a new variable will // be created. - VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; + VLOG(30) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); auto& feed_inputs = *(g_feed_value->GetMutable()); if (index >= feed_inputs.size()) { @@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, typeid(FeedFetchList).name()); auto& fetch_outputs = *g_fetch_value->GetMutable(); auto& tensor = fetch_outputs[index]; - VLOG(3) << "Fetch " << var_name << " with index " << index - << " shape= " << tensor.dims(); + VLOG(30) << "Fetch " << var_name << " with index " << index + << " shape= " << tensor.dims(); PADDLE_ENFORCE_LT(index, fetch_outputs.size()); return tensor; } diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 6090f1fe76a49dddad0640123b1fa4db8c489634..6b284b1c1a4a37803229f4d55b100ca1da3a741d 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) { scope->Var(param.LSTMX)->GetMutable(); scope->Var(param.LSTMOUT)->GetMutable(); -#define GATE_W(name__) \ - auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \ - auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \ - auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \ - CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \ - VLOG(4) << #name__ "_w0" \ - << " shape: " << W_##name__##_w0->Get().dims(); 
\ - VLOG(4) << #name__ "_w1" \ - << " shape: " << W_##name__##_w1->Get().dims(); \ - VLOG(4) << #name__ "_b0" \ - << " shape: " << W_##name__##_b0->Get().dims(); \ - auto& W_##name__##_w0_t = W_##name__##_w0->Get(); \ - auto& W_##name__##_w1_t = W_##name__##_w1->Get(); \ +#define GATE_W(name__) \ + auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \ + auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \ + auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \ + CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \ + VLOG(40) << #name__ "_w0" \ + << " shape: " << W_##name__##_w0->Get().dims(); \ + VLOG(40) << #name__ "_w1" \ + << " shape: " << W_##name__##_w1->Get().dims(); \ + VLOG(40) << #name__ "_b0" \ + << " shape: " << W_##name__##_b0->Get().dims(); \ + auto& W_##name__##_w0_t = W_##name__##_w0->Get(); \ + auto& W_##name__##_w1_t = W_##name__##_w1->Get(); \ auto& W_##name__##_b0_t = W_##name__##_b0->Get(); GATE_W(forget); @@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, int D = W_forget_w0.dims()[0]; int M = W_forget_w1.dims()[0]; out->Resize(make_ddim({D + M, 4 * D})); - VLOG(3) << "LSTMWeight resized to " << out->dims(); + VLOG(30) << "LSTMWeight resized to " << out->dims(); float* out_data = out->mutable_data(platform::CPUPlace()); std::array tensors( diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc index 449cc78be15bcd2575ce2e6846b41e475f8921f6..c9c4d5afe5a0cd67ea14ae7abcf2b2bad1407e39 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc @@ -57,7 +57,7 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( int found_conv_bias_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "handle ConvBias fuse"; + VLOG(40) << "handle ConvBias fuse"; GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, conv_bias_pattern); // 
Filter GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp @@ -74,7 +74,7 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( // check if fuse can be done and if MKL-DNN should be used FuseOptions fuse_option = FindFuseOption(*conv, *eltwise); if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) { - VLOG(3) << "do not perform conv+bias fuse"; + VLOG(30) << "do not perform conv+bias fuse"; return; } diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 846a14e365e6bd7f056d409130a3b246371931da..34b4c26ae3a8c281cd2729f67e49c78a8f440cc5 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -121,7 +121,7 @@ std::unique_ptr ConvBNFusePass::ApplyImpl( int found_conv_bn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "handle ConvBN fuse"; + VLOG(40) << "handle ConvBN fuse"; // conv, batch_norm, // conv_weight, conv_out, @@ -133,7 +133,7 @@ std::unique_ptr ConvBNFusePass::ApplyImpl( // check if fuse can be done and if MKL-DNN should be used FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm); if (fuse_option == DO_NOT_FUSE) { - VLOG(3) << "do not perform conv+bn fuse"; + VLOG(30) << "do not perform conv+bn fuse"; return; } @@ -241,7 +241,7 @@ std::unique_ptr ConvEltwiseAddBNFusePass::ApplyImpl( int found_conv_bn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "handle ConvBN fuse"; + VLOG(40) << "handle ConvBN fuse"; // conv, batch_norm, // conv_weight, conv_out, diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index e359a3832ee8d549f8c58d63bc1cc6564ecadede..048868e1f913e9df3d985b9e66c075a02a7f0bcb 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ 
-38,7 +38,7 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( int found_conv_relu_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "handle ConvReLU fuse"; + VLOG(40) << "handle ConvReLU fuse"; GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, conv_relu_pattern); // Filter GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp @@ -48,7 +48,7 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( FuseOptions fuse_option = FindFuseOption(*conv, *relu); if (fuse_option == DO_NOT_FUSE) { - VLOG(3) << "do not perform conv+relu fuse"; + VLOG(30) << "do not perform conv+relu fuse"; return; } diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc index 19056e18aa892dbc83dfbf7305b6ad8b6b6bc51c..5f3334578d10f64b197215bfc11d08e30747cb90 100644 --- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc @@ -39,7 +39,7 @@ std::unique_ptr DepthwiseConvMKLDNNPass::ApplyImpl( int found_depthwise_conv_mkldnn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(3) << "handle DepthwiseConvMKLDNN fuse"; + VLOG(30) << "handle DepthwiseConvMKLDNN fuse"; GET_NODE(depthwise_conv, (*pattern)); depthwise_conv->Op()->SetType("conv2d"); found_depthwise_conv_mkldnn_count++; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index ca704c7f5631bbaa88f1bc2caaa22fd021de11c4..3348abb19b3339b2b3e8b50485133b15a1973a32 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -39,7 +39,7 @@ std::unique_ptr FCFusePass::ApplyImpl( int found_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "handle FC fuse"; + VLOG(40) << "handle FC fuse"; GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); 
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 648acc4a759417240d9a39749b059289182ebb1e..8ed68905beed2faedc34f194070cc76e8ff3c32d 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -61,7 +61,7 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddAct( auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { - VLOG(4) << "handle FuseElewiseAddAct fuse"; + VLOG(40) << "handle FuseElewiseAddAct fuse"; GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern); GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, elewise_add_act_pattern); @@ -77,10 +77,10 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddAct( Node *elewise_add_act_node = CreateFuseElewiseAddActNode( g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n); - VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> " - << ele_add->Name() << " -> " << ele_out_n << "\n" - << "\t " << ele_out_n << " -> " << act->Name() << " -> " - << act_out_n; + VLOG(40) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> " + << ele_add->Name() << " -> " << ele_out_n << "\n" + << "\t " << ele_out_n << " -> " << act->Name() << " -> " + << act_out_n; ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node); found_elewise_add_act_count++; @@ -113,7 +113,7 @@ std::unique_ptr FuseElewiseAddActPass::FuseActElewiseAdd( auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { - VLOG(4) << "handle FuseElewiseAddAct fuse"; + VLOG(40) << "handle FuseElewiseAddAct fuse"; GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern); GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern); GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, @@ -129,9 +129,9 @@ std::unique_ptr 
FuseElewiseAddActPass::FuseActElewiseAdd( Node *elewise_add_act_node = CreateFuseElewiseAddActNode( g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n); - VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n - << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> " - << ele_add->Name() << " -> " << elewise_add_out_n; + VLOG(40) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n + << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> " + << ele_add->Name() << " -> " << elewise_add_out_n; ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node); found_elewise_add_act_count++; @@ -165,7 +165,7 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { - VLOG(4) << "handle FuseElewiseAddActGrad1 fuse"; + VLOG(40) << "handle FuseElewiseAddActGrad1 fuse"; GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern); GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern); GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out, @@ -208,10 +208,10 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( auto fused_node = g->CreateOpNode(&desc); - VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> " - << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t " - << d_itermediate_out_n << " and " << act_out_n << " -> " - << ele_add_grad->Name() << " -> " << d_itermediate_out_n; + VLOG(40) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> " + << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t " + << d_itermediate_out_n << " and " << act_out_n << " -> " + << ele_add_grad->Name() << " -> " << d_itermediate_out_n; ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node); found_elewise_add_act_count++; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 
132159b8b272f311060a39b58919c26822bf50ee..a2a8baa5e45d1791120e32c62dd0dbc533668290 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -92,7 +92,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { std::map> Graph::InitFromProgram( const ProgramDesc &program) { - VLOG(3) << "block in program:" << program_.Size(); + VLOG(30) << "block in program:" << program_.Size(); std::unordered_map all_vars; // var nodes for each var name, will have multiple versions in SSA std::map> var_nodes; @@ -160,7 +160,7 @@ void Graph::ResolveHazard( auto it_old = versions.rbegin(); ++it_old; for (; it_old != versions.rend(); it_new = it_old, ++it_old) { - VLOG(3) << "deal with var: " << (*it_new)->Name(); + VLOG(30) << "deal with var: " << (*it_new)->Name(); ir::Node *write_op = (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; const auto &read_ops = (*it_old)->outputs; diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 8830638ec8b70c3fcaaa83c2c3c819e2cc8ab795..6384d89d2f2af4ab1d733af5eb1561cab2d09728 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -89,7 +89,7 @@ class Graph { attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { - VLOG(3) << "deleting " << attr_name; + VLOG(30) << "deleting " << attr_name; delete attr; }; } diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 01e878089171e4620f32b57a65d92d1c86d307db..98112c1ed317c230cb5150e7cbc6d0d173256601 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -33,8 +33,9 @@ void SortHelper( } } - VLOG(3) << "topology sort insert: " << node->Name() - << reinterpret_cast(node) << " input " << node->inputs.size(); + VLOG(30) << "topology sort insert: " << node->Name() + << reinterpret_cast(node) << " input " + << node->inputs.size(); ret->push_back(node); } @@ 
-103,9 +104,9 @@ std::map> BuildOperationAdjList( for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); - VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) - << " -> " << n->Name() << reinterpret_cast(n) - << " via " << var->Name() << reinterpret_cast(var); + VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); adj_list[n].insert(adj_n); } } @@ -163,10 +164,10 @@ size_t GraphNum(const Graph &graph) { graph_nodes.emplace_back(g_nodes); } - if (VLOG_IS_ON(10)) { - VLOG(10) << "graph_num: " << graph_nodes.size(); + if (VLOG_IS_ON(100)) { + VLOG(100) << "graph_num: " << graph_nodes.size(); for (auto &g_n : graph_nodes) { - VLOG(10) << "graph_nodes: " << g_n.size(); + VLOG(100) << "graph_nodes: " << g_n.size(); if (g_n.size() < 10) { std::stringstream out; for (auto &node : g_n) { @@ -180,7 +181,7 @@ size_t GraphNum(const Graph &graph) { } out << "]"; } - VLOG(10) << out.str(); + VLOG(100) << out.str(); } } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index b20d70132256bd5df7411c46ff4eb246b1f14ba8..30c1047ef53c54161fb4f54498b16ea66ab7c7dd 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include #include @@ -91,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph, PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); int id = 0; for (auto &g : subgraphs) { - VLOG(3) << "optimizing #" << id++ << " subgraph"; + VLOG(30) << "optimizing #" << id++ << " subgraph"; handler(g, graph); } } bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { - VLOG(3) << "mark pdnodes in graph"; + VLOG(30) << "mark pdnodes in graph"; if (graph.Nodes().empty()) return false; for (auto &node : GraphTraits::DFS(graph)) { for (const auto &pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { - VLOG(4) << "pdnode " << pdnode->name() << " marked"; + VLOG(40) << "pdnode " << pdnode->name() << " marked"; pdnodes2nodes_[pdnode.get()].insert(&node); } } @@ -111,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { // Check to early stop if some PDNode can't find matched Node. for (auto &pdnode : pattern_.nodes()) { if (!pdnodes2nodes_.count(pdnode.get())) { - VLOG(4) << pdnode->name() << " can't find matched Node, early stop"; + VLOG(40) << pdnode->name() << " can't find matched Node, early stop"; // return false; } } @@ -120,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { GetMarkedNodes(const_cast(&graph)).insert(n); } } - VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; + VLOG(30) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); } @@ -213,7 +214,7 @@ GraphPatternDetector::DetectPatterns() { // Extend a PDNode to subgraphs by deducing the connection relations defined // in edges of PDNodes. for (const auto &edge : pattern_.edges()) { - VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); + VLOG(40) << "check " << edge.first->name() << " -> " << edge.second->name(); // TODO(Superjomn) Fix bug here, the groups might be duplicate here. // Each role has two PDNodes, which indicates two roles. 
// Detect two Nodes that can match these two roles and they are connected. @@ -224,7 +225,7 @@ GraphPatternDetector::DetectPatterns() { // source -> target for (Node *source : pdnodes2nodes_[edge.first]) { for (Node *target : pdnodes2nodes_[edge.second]) { - VLOG(8) << "check " << source->id() << " -- " << target->id(); + VLOG(80) << "check " << source->id() << " -- " << target->id(); // TODO(Superjomn) add some prune strategies. for (const auto &group : pre_groups) { HitGroup new_group = group; @@ -240,12 +241,13 @@ GraphPatternDetector::DetectPatterns() { } } } - VLOG(3) << "step " << step << " get records: " << cur_groups.size(); + VLOG(30) << "step " << step << " get records: " << cur_groups.size(); for (auto &group : cur_groups) { for (auto &item : group.roles) { - VLOG(4) << "node " << item.second->id() << " as " << item.first->name(); + VLOG(40) << "node " << item.second->id() << " as " + << item.first->name(); } - VLOG(4) << "========================================================="; + VLOG(40) << "========================================================="; } } @@ -259,14 +261,16 @@ GraphPatternDetector::DetectPatterns() { return result; } -bool GraphItemCMP(const std::pair &a, +struct GraphItemLessThan { + bool operator()(const std::pair &a, const std::pair &b) { - if (a.first != b.first) { - return a.first < b.first; - } else { - return a.second < b.second; + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } } -} +}; // TODO(Superjomn) enhance the function as it marks unique unique as duplicates // see https://github.com/PaddlePaddle/Paddle/issues/13550 @@ -280,7 +284,7 @@ void GraphPatternDetector::UniquePatterns( for (auto &g : *subgraphs) { // Sort the items in the sub-graph, and transform to a string key. 
std::vector> sorted_keys(g.begin(), g.end()); - std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan()); std::stringstream ss; for (auto &item : sorted_keys) { ss << item.first << ":" << item.second; diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 31ed98db72c8fd4af8c970861d386687962001ce..13dd354dc59b2bf00a741c565a4c97719eac76c3 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -41,7 +41,7 @@ std::string FormatName(const Node* node) { std::unique_ptr GraphVizPass::ApplyImpl( std::unique_ptr graph) const { const std::string graph_viz_path = Get(kGraphVizPath); - VLOG(3) << "draw IR graph viz to " << graph_viz_path; + VLOG(30) << "draw IR graph viz to " << graph_viz_path; std::unique_ptr fout(new std::ofstream(graph_viz_path)); PADDLE_ENFORCE(fout->good()); std::ostream& sout = *fout; diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 65be69b7f5b5e363d5d0753c45f9ff9e3f329fbe..145a3a455c8ae2c1e6a5bc4fefa3491f420af5ba 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -20,7 +20,7 @@ namespace ir { std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { - VLOG(3) << "Aplies MKL-DNN placement strategy."; + VLOG(30) << "Aplies MKL-DNN placement strategy."; for (const Node* n : graph->Nodes()) { if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) { n->Op()->SetAttr("use_mkldnn", true); diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index bd5b76426eb55cebdabfccd700439a4c418a10f0..532961e4d59ad3611dc93b20738080d1755290e8 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc 
@@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc( string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat); VarDesc repeated_var = CopyVarDesc(var_desc); repeated_var.SetName(new_gname); - VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat; + VLOG(30) << "update " << var_desc->Name() << " to repeat " << repeat; return repeated_var; } return *var_desc; @@ -78,7 +78,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( std::vector nodes = TopologySortOperations(*graph); auto origin_nodes = graph->ReleaseNodes(); - VLOG(3) << "origin nodes count: " << origin_nodes.size(); + VLOG(30) << "origin nodes count: " << origin_nodes.size(); ir::Graph& result = *graph; // 1. record op nodes of different roles @@ -137,8 +137,8 @@ std::unique_ptr BatchMergePass::ApplyImpl( "%s.repeat.%d", repeated_op.Input("Variance")[0], i); bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]); bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]); - VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to " - << new_mean_name; + VLOG(30) << "renaming " << repeated_op.Input("Mean")[0] << " to " + << new_mean_name; repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name); repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name); repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0], diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 9570c59cff2a6afeb1c607f7219b7b455974d6ce..8ac8d7677e1fe339d7802ea262e61a02a678aab5 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -76,7 +76,7 @@ class Pass { attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { - VLOG(3) << "deleting " << attr_name; + VLOG(30) << "deleting " << attr_name; delete attr; }; } diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index 
a7d5161c35db804703415066990f34da8109fbd9..b7687d61de3eacd47ff1208ba14c3f482215c1d4 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" +#include +#include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -159,10 +162,7 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) { std::set acts({"sigmoid", "tanh", "relu", "identity"}); PDNode* act = pattern->NewNode( - [=](Node* x) { - return x && x->IsOp() && acts.count(x->Op()->Type()); - - }, + [=](Node* x) { return x && x->IsOp() && acts.count(x->Op()->Type()); }, "act"); PDNode* fc_out = pattern->NewNode( @@ -196,7 +196,7 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - VLOG(4) << "get one concat pattern"; + VLOG(40) << "get one concat pattern"; // fc GET_NODE(fc_w, detector.pattern()); GET_NODE(fc_bias, detector.pattern()); diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 0a1f65d274708dd208d7783c6273160c4c61738a..015b5e3c6363cc96e31e21095fbbb007543c99af 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "handle SeqConv EltAdd Relu fuse"; + VLOG(40) 
<< "handle SeqConv EltAdd Relu fuse"; GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern); diff --git a/paddle/fluid/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc index 6bc795b642bf79b7556869c5ebe9b0323d3cc5fc..660ce2ec85131bafae27e8b7800fbfa3c238b59a 100644 --- a/paddle/fluid/framework/lod_rank_table.cc +++ b/paddle/fluid/framework/lod_rank_table.cc @@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) { TableItem item; item.index = i; item.length = vec[i + 1] - vec[i]; - VLOG(10) << "Add item to rank table " << item.index << " " << item.length; + VLOG(100) << "Add item to rank table " << item.index << " " << item.length; items_.emplace_back(item); } // NOTE(yuyang18): diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc index 0599c8d384641606b0a5ebb5ba1781b56f539e63..0330cae377c32b2d49d409eff42b968d81356d49 100644 --- a/paddle/fluid/framework/mixed_vector_test.cc +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) { TEST(mixed_vector, ForEach) { vec tmp; for (auto& v : tmp) { - VLOG(3) << v; + VLOG(30) << v; } } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 7fb42feb95b4d54aec693228721c052f683f4d80..8e660f97f051b194a0305dc82371fbe64da7e061 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -71,7 +71,7 @@ void NaiveExecutor::Prepare(Scope *parent_scope, void NaiveExecutor::Run() { for (auto &op : ops_) { - VLOG(4) << "run " << op->Type(); + VLOG(40) << "run " << op->Type(); op->Run(*scope_, place_); } } @@ -95,21 +95,21 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope, if (var->Persistable()) { auto *ptr = 
const_cast(ancestor_scope)->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; + VLOG(30) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; } else { // Create temporary variables in local scope. auto *ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + VLOG(30) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; } } } else { for (auto &var : global_block.AllVars()) { auto *ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; + VLOG(30) << "Create variable " << var->Name() << ", which pointer is " + << ptr; } } } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 8ece618f3f72552fedcffab3e03ebb30476b7cab..fbaa169df6324761ef9136aa173dce4e2182ed38 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -82,7 +82,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); if (in_var->GetType() != proto::VarType::LOD_TENSOR) { - VLOG(3) << "input " << in << " is not LodTensor"; + VLOG(30) << "input " << in << " is not LodTensor"; return; } out_var->SetLoDLevel(in_var->GetLoDLevel()); @@ -241,32 +241,32 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { const proto::OpProto::Attr &attr = GetProtoAttr(name); switch (attr.type()) { case proto::AttrType::BOOLEANS: { - VLOG(11) << "SetAttr: " << Type() << ", " << name - << " from INTS to BOOLEANS"; + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from INTS to BOOLEANS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::INTS: { - 
VLOG(11) << "SetAttr: " << Type() << ", " << name - << " from INTS to INTS"; + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from INTS to INTS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::FLOATS: { - VLOG(11) << "SetAttr: " << Type() << ", " << name - << " from INTS to FLOATS"; + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from INTS to FLOATS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::STRINGS: { - VLOG(11) << "SetAttr: " << Type() << ", " << name - << " from INTS to STRINGS"; + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from INTS to STRINGS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::BLOCKS: { - VLOG(11) << "SetAttr: " << Type() << ", " << name - << " from INTS to BLOCKS"; + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from INTS to BLOCKS"; this->SetBlocksAttr(name, std::vector()); return; } @@ -499,13 +499,13 @@ void OpDesc::CheckAttrs() { } void OpDesc::InferShape(const BlockDesc &block) const { - VLOG(3) << "CompileTime infer shape on " << Type(); + VLOG(30) << "CompileTime infer shape on " << Type(); InitInferShapeFuncs(); auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; PADDLE_ENFORCE(static_cast(infer_shape), "%s's infer_shape has not been registered", this->Type()); CompileTimeInferShapeContext ctx(*this, block); - if (VLOG_IS_ON(10)) { + if (VLOG_IS_ON(100)) { std::ostringstream sout; auto inames = this->InputArgumentNames(); sout << " From ["; @@ -516,7 +516,7 @@ void OpDesc::InferShape(const BlockDesc &block) const { std::copy(onames.begin(), onames.end(), std::ostream_iterator(sout, ", ")); sout << "]"; - VLOG(10) << sout.str(); + VLOG(100) << sout.str(); } infer_shape(&ctx); } @@ -607,7 +607,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { auto shape = var->GetShape(); res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); } catch (...) 
{ - VLOG(5) << "GetDim of variable " << name << " error"; + VLOG(50) << "GetDim of variable " << name << " error"; std::rethrow_exception(std::current_exception()); } return res; @@ -624,7 +624,7 @@ std::vector CompileTimeInferShapeContext::GetRepeatedDims( res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s)); } } catch (...) { - VLOG(5) << "GetRepeatedDim of variable " << name << " error."; + VLOG(50) << "GetRepeatedDim of variable " << name << " error."; std::rethrow_exception(std::current_exception()); } return res; diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc index bfc411ca2c4a483e344b368da089392d8e4a87c1..4a841bae8323f5733ba413a2c623a8147ec32f67 100644 --- a/paddle/fluid/framework/op_registry.cc +++ b/paddle/fluid/framework/op_registry.cc @@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap( std::unique_ptr OpRegistry::CreateOp( const proto::OpDesc& op_desc) { - VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" - "used in unit tests. Use CreateOp(const OpDesc& op_desc) " - "instead."; + VLOG(10) << "CreateOp directly from OpDesc is deprecated. It should only be" + "used in unit tests. 
Use CreateOp(const OpDesc& op_desc) " + "instead."; VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); AttributeMap attrs; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 73886ed304188d95e1d6a4a49f8ce54168ae41ce..5624878d439873e5f6aee6ec9234e31d5c77ff97 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -140,7 +140,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(4) << place << " " << DebugStringEx(&scope); + VLOG(40) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("Cannot run operator on place %s", place); @@ -160,7 +160,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { } else { RunImpl(scope, place); } - VLOG(3) << place << " " << DebugStringEx(&scope); + VLOG(30) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -259,6 +259,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { if (row_size >= 0) { ss << "[row_size=" << row_size << "]"; } + std::string dtype = GetDtype(*scope, output.second[i]); + ss << ":" << dtype; ss << "[" << GetDims(*scope, var_name, true) << "]"; ss << "(" << GetLoD(*scope, var_name) << ")"; } @@ -715,14 +717,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope, auto expected_kernel_key = this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + VLOG(30) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set if (kernel_iter == kernels.end() && 
expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + VLOG(30) << "missing MKLDNN kernel: fallbacking to PLAIN one"; expected_kernel_key.library_type_ = LibraryType::kPlain; expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; kernel_iter = kernels.find(expected_kernel_key); @@ -774,7 +776,8 @@ void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { for (auto& var_name : inplace_vars) { - VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; + VLOG(30) << "share inplace var " + var_name + + " back to it's original scope"; auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name)); auto* var = transfer_scope.FindVar(var_name); @@ -815,8 +818,8 @@ Scope* OperatorWithKernel::TryTransferData( transfered_inplace_vars->emplace_back(var_name); } - VLOG(3) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; + VLOG(30) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; if (new_scope == nullptr) { new_scope = &scope.NewScope(); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfb107688ad7281765049cd9849d56b8a61bdd37..39b47415ff7e378cabc79e668fe2be63eb71d87f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -199,7 +199,7 @@ void ParallelExecutor::BCastParamsToDevices( auto &main_tensor = main_var->Get(); if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; + VLOG(30) << "one in var not inited, return!"; continue; } auto &dims = main_tensor.dims(); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 
a4abd1b1283f08fb8431fbeea0cea17c8439fdd7..0c407f8c1d11a8a0f99551fc51d2ef2be5262c63 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -149,7 +149,7 @@ Variable* Scope::VarInternal(const std::string& name) { v = new Variable(); vars_[name].reset(v); - VLOG(3) << "Create variable " << name; + VLOG(30) << "Create variable " << name; v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 8c290bb095d554a973e66a3a19606a06759fd668..3319c772ec789bc5b28307906adfb2a9417d9182 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -176,7 +176,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, PADDLE_ENFORCE(value->IsInitialized(), "The value tensor should be initialized."); if (ids.numel() == 0) { - VLOG(3) << "keys is empty, please check data!"; + VLOG(30) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index ca1e01c89f07c4ffc3979a6a6c3728328e0a1819..8d8f07a1f52b3062498b59a4dbc20219d42e4735 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -22,8 +22,8 @@ namespace framework { void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst) { - VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " - << dst_place; + VLOG(30) << "TensorCopy " << src.dims() << " from " << src.place() << " to " + << dst_place; src.check_memory_size(); dst->Resize(src.dims()); @@ -37,8 +37,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (src_ptr == 
dst_ptr) { - VLOG(3) << "Skip copy the same data async from " << src_place << " to " - << dst_place; + VLOG(30) << "Skip copy the same data async from " << src_place << " to " + << dst_place; return; } memory::Copy(boost::get(dst_place), dst_ptr, @@ -77,8 +77,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { if (src_ptr == dst_ptr) { - VLOG(3) << "Skip copy the same data async from " << src_place << " to " - << dst_place; + VLOG(30) << "Skip copy the same data async from " << src_place << " to " + << dst_place; return; } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, @@ -114,8 +114,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { - VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place() - << " to " << dst_place; + VLOG(30) << "TensorCopySync " << src.dims() << " from " << src.place() + << " to " << dst_place; src.check_memory_size(); dst->Resize(src.dims()); dst->set_layout(src.layout()); @@ -125,8 +125,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (src_ptr == dst_ptr) { - VLOG(3) << "Skip copy the same data from " << src_place << " to " - << dst_place; + VLOG(30) << "Skip copy the same data from " << src_place << " to " + << dst_place; return; } memory::Copy(boost::get(dst_place), dst_ptr, @@ -146,8 +146,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { - VLOG(3) << "Skip copy the same data from " << src_place << " to " - << dst_place; + VLOG(30) << "Skip copy the same 
data from " << src_place << " to " + << dst_place; return; } auto src_gpu_place = boost::get(src_place); diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index fcec955360f1c681a62929e904d5736854a8ffad..2dab4e793eeacd65239786976948b8043aeeb215 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -39,7 +39,7 @@ void ThreadPool::Init() { int num_threads = std::thread::hardware_concurrency(); if (FLAGS_dist_threadpool_size > 0) { num_threads = FLAGS_dist_threadpool_size; - VLOG(1) << "set dist_threadpool_size to " << num_threads; + VLOG(10) << "set dist_threadpool_size to " << num_threads; } PADDLE_ENFORCE_GT(num_threads, 0); threadpool_.reset(new ThreadPool(num_threads)); diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 7e3f002b53351ba5892aaa50482b21a83db94069..29ef459b454075a30c3a4d0ff0f9ef1212292b4b 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const { void VarDesc::SetShapes( const std::vector> &multiple_dims) { if (multiple_dims.size() != GetTensorDescNum()) { - VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() - << ") doesn't match the existing tensor number(" - << GetTensorDescNum() - << "). The Reader is going to be reinitialized."; + VLOG(30) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; SetTensorDescNum(multiple_dims.size()); } std::vector tensors = mutable_tensor_descs(); @@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) { void VarDesc::SetDataTypes( const std::vector &multiple_data_type) { if (multiple_data_type.size() != GetTensorDescNum()) { - VLOG(3) << "WARNING: The number of given data types(" - << multiple_data_type.size() - << ") doesn't match the existing tensor number(" - << GetTensorDescNum() - << "). The Reader is going to be reinitialized."; + VLOG(30) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; SetTensorDescNum(multiple_data_type.size()); } std::vector tensor_descs = @@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) { void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { if (multiple_lod_level.size() != GetTensorDescNum()) { - VLOG(3) << "WARNING: The number of given lod_levels(" - << multiple_lod_level.size() - << ") doesn't match the existing tensor number(" - << GetTensorDescNum() - << "). The Reader is going to be reinitialized."; + VLOG(30) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; SetTensorDescNum(multiple_lod_level.size()); } switch (desc_.type().type()) { diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index f3035cd712bdea517068b4c172bb2794d5fccddb..64236b78d2e390ea5f6c43c76a4b33b62c67629f 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" namespace paddle { @@ -24,5 +27,27 @@ class VarTypeInference { virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; }; +class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const final { + auto in_out_var_names = this->GetInputOutputWithSameType(); + + for (auto& i_o_n : in_out_var_names) { + auto& x_name = op_desc.Input(i_o_n.first).at(0); + auto& out_name = op_desc.Output(i_o_n.second).at(0); + + auto& x = block->FindRecursiveOrCreateVar(x_name); + auto& out = block->FindRecursiveOrCreateVar(out_name); + out.SetType(x.GetType()); + out.SetDataType(x.GetDataType()); + } + } + + protected: + virtual std::unordered_map + GetInputOutputWithSameType() const = 0; +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index ef4142f334e503380dc7ccd74c348404ffe52ee6..d55303a51e9fee3057455470b4b3139dc5f85e89 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -60,7 +60,7 @@ class DfgPassManagerImpl final : public DfgPassManager { private: void AddPass(const std::string& name, AnalysisPass* pass) { - VLOG(3) << "Adding pass " << name; + VLOG(30) << "Adding pass " << name; Register(name, pass); AddGraphvizDebugerPass(pass); } @@ -101,22 +101,25 @@ Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } void Analyzer::Run(Argument* argument) { std::vector passes; + passes.push_back("graph_viz_pass"); // add graphviz for debug. 
#ifdef PADDLE_WITH_MKLDNN if (use_mkldnn_) { - VLOG(3) << "Adding MKL-DNN placement pass"; + VLOG(30) << "Adding MKL-DNN placement pass"; passes.push_back("mkldnn_placement_pass"); } #endif // infer_clean_graph_pass should be the first default pass // after mkldnn_placement_pass. passes.push_back("infer_clean_graph_pass"); + passes.push_back("graph_viz_pass"); // add graphviz for debug. for (auto& pass : ir_passes_) { - if (!disabled_ir_passes_.count(pass)) { + // skip mkldnn pass when use_mkldnn_ = false; + bool skip_pass = (!use_mkldnn_) && pass.find("mkldnn") != std::string::npos; + if (!disabled_ir_passes_.count(pass) && !skip_pass) { passes.push_back(pass); passes.push_back("graph_viz_pass"); // add graphviz for debug. } } - passes.push_back("graph_viz_pass"); argument->Set(kFluidToIrPassesAttr, new std::vector(passes)); for (auto& x : data_) { diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index e8fb0775b45761f64fd6fd28306c35b76d1e40c4..9495e2435c79ff660c64322d2acd8e058e09e563 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -68,8 +68,8 @@ struct Argument { key); attrs_[key] = data; attr_deleters_[key] = [data, key]() { - VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; - VLOG(3) << "argument delete attr: " << key; + VLOG(30) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + VLOG(30) << "argument delete attr: " << key; delete data; }; } diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 8c7d58678fd29cb25d13d64a08e6c6f26f242d8b..545017da07f8e414f21a02c35fca96aba6de41aa 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -112,8 +112,8 @@ void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) { out_alias->SetPbMsg(out->pb_msg()); var2id[out_alias->name()] = out_alias->id(); // update variable's alias 
Node - LOG(INFO) << "loop found in graph, create SSA alias node [" - << out_alias->repr() << "] for [" << out->repr() << "]"; + VLOG(40) << "loop found in graph, create SSA alias node [" + << out_alias->repr() << "] for [" << out->repr() << "]"; out = out_alias; } out->inlinks.push_back(o); @@ -132,7 +132,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) { Node *x{nullptr}; if (ir_node->IsOp()) { PADDLE_ENFORCE(ir_node->Op()); - VLOG(4) << "get op " << ir_node << " " << ir_node->Name(); + VLOG(40) << "get op " << ir_node << " " << ir_node->Name(); x = nodes.Create(Node::Type::kFunction); x->attr("ir_node").Pointer() = ir_node; PADDLE_ENFORCE(ir_node->Op()->Proto()); @@ -141,7 +141,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) { } else if (ir_node->IsVar()) { // Not create a Node for IR ControlDepVar, considering Inference currently // just used in single thread scenerio. - VLOG(4) << "get var " << ir_node->Name(); + VLOG(40) << "get var " << ir_node->Name(); x = nodes.Create(Node::Type::kValue); x->attr("ir_node").Pointer() = ir_node; x->SetName(ir_node->Name()); @@ -151,9 +151,9 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) { } ir_node_map.emplace(ir_node, x); } - VLOG(4) << "finish creating Nodes"; + VLOG(40) << "finish creating Nodes"; - VLOG(4) << "to create edge"; + VLOG(40) << "to create edge"; // Create links for (auto *ir_node : graph.Nodes()) { auto it = ir_node_map.find(ir_node); @@ -175,7 +175,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) { "Can't deduce any inputs from the graph, Is the graph empty?"); ir_graph = &graph; - VLOG(3) << "finished build from IR"; + VLOG(30) << "finished build from IR"; } void DataFlowGraph::Clean() { diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index cb549f4b50cf56154a951d16b58b022dbad3e990..dbe138514b20a4be20e7cca800f8e12b230e7824 100644 --- 
a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -239,9 +239,10 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { framework::BlockDesc block_desc(nullptr, &proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); - VLOG(4) << "origin variable size: " - << argument_->origin_program_desc->blocks(0).vars().size(); - VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size(); + VLOG(40) << "origin variable size: " + << argument_->origin_program_desc->blocks(0).vars().size(); + VLOG(40) << "transformed variable size: " + << block_desc.Proto()->vars().size(); // copy ops. for (auto *node : block_node->subgraph) { diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc index 648b8f7d6a6ec4bafbad2838c5631e776c8699b1..8888529a57a29c4349095c2ff4c527346716e026 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; std::string message; - VLOG(3) << "draw to " << png_path; + VLOG(30) << "draw to " << png_path; ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); } diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc index fc60ca3bd0bf706407defb2655a093d999aef7c2..9f52af670b8e26c31230bc003af4c9ddc5b67802 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc @@ -29,7 +29,7 @@ void FluidToIrPass::EnableParamModify(const std::string &model_dir, PADDLE_ENFORCE(argument_); argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope); // Load parameters. 
- VLOG(3) << "Loading parameters from " << model_dir; + VLOG(30) << "Loading parameters from " << model_dir; LoadParams(&argument_->Get(framework::ir::kParamScopeAttr), model_dir, prog_file, param_file); } diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc index c313db08875669010ddcca13aa66b383ee6d26f8..4f40a7a1adc324b824af8e3831901abcbffaeca6 100644 --- a/paddle/fluid/inference/analysis/model_store_pass.cc +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -35,21 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) { std::stringstream ss; // NOTE these commands only works on linux. ss << "mkdir -p " << *argument_->model_output_store_path; - VLOG(3) << "run command: " << ss.str(); + VLOG(30) << "run command: " << ss.str(); PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); ss.str(""); ss << "cp " << *argument_->fluid_model_dir << "/*" << " " << *argument_->model_output_store_path; - VLOG(3) << "run command: " << ss.str(); + VLOG(30) << "run command: " << ss.str(); PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); // Store program PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, "program desc is not transformed, should call " "DataFlowGraphToFluidPass first."); - VLOG(3) << "store analyzed program to " - << *argument_->model_output_store_path; + VLOG(30) << "store analyzed program to " + << *argument_->model_output_store_path; const std::string program_output_path = *argument_->model_output_store_path + "/__model__"; std::ofstream file(program_output_path, std::ios::binary); diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index a6ac0ee49f8f408faa7a17bf5ef5d2799a9a6238..ce390ee8313d6e3e2f0d79fb59d2225e2779180b 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -23,7 +23,7 @@ namespace analysis { bool PassManager::Initialize(Argument* argument) { argument_ 
= argument; for (auto& pass : data_) { - VLOG(3) << "Initializing pass [" << pass->repr() << "]"; + VLOG(30) << "Initializing pass [" << pass->repr() << "]"; if (!pass->Initialize(argument)) { LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; return false; @@ -34,7 +34,7 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); - VLOG(3) << "Total " << data_.size() << " Analysys passes"; + VLOG(30) << "Total " << data_.size() << " Analysys passes"; for (auto& pass : data_) { string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]", pass->repr()); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 526bbbadfe90c3064d7c620cc22e30f7fef99088..3688ea15d959309d33901c360cb1055e2ac489a5 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -232,7 +232,7 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { BriefNode *brief_node = itr.second; if (!brief_node->node->attr(kMarkerAttrName).Bool()) { - VLOG(4) << brief_node->node->id() << " node not a trt candicate."; + VLOG(40) << brief_node->node->id() << " node not a trt candicate."; continue; } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index cc1746ecb34c983d219693bcec17c8789c38fa9f..3aa65f223a9e70b8ba7e387d1766ec6a97aee385 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -25,9 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)(); - VLOG(4) << "debug info " - << graph->HumanReadableInfo(false /*show_values*/, - true /*show_functions*/); + VLOG(40) << "debug info " + << graph->HumanReadableInfo(false 
/*show_values*/, + true /*show_functions*/); } } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 54c37fe64590aa82d7100c93c4c5c4ee36491421..dd295854a87c9707aaa85e1e2c6111089e3fe885 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -38,7 +38,7 @@ using contrib::AnalysisConfig; bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, const std::shared_ptr &program) { - VLOG(3) << "Predictor::init()"; + VLOG(30) << "Predictor::init()"; #if !defined(_WIN32) if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; @@ -89,7 +89,7 @@ bool AnalysisPredictor::Init( bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { - VLOG(3) << "Predictor::predict"; + VLOG(30) << "Predictor::predict"; inference::Timer timer; timer.tic(); // set feed variable @@ -109,7 +109,7 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to get fetches"; return false; } - VLOG(3) << "predict cost: " << timer.toc() << "ms"; + VLOG(30) << "predict cost: " << timer.toc() << "ms"; // Fix TensorArray reuse not cleaned bug. 
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); @@ -119,7 +119,7 @@ bool AnalysisPredictor::Run(const std::vector &inputs, bool AnalysisPredictor::SetFeed(const std::vector &inputs, framework::Scope *scope) { - VLOG(3) << "Predictor::set_feed"; + VLOG(30) << "Predictor::set_feed"; if (inputs.size() != feeds_.size()) { LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " << inputs.size(); @@ -184,7 +184,7 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { - VLOG(3) << "Predictor::get_fetch"; + VLOG(30) << "Predictor::get_fetch"; outputs->resize(fetchs_.size()); for (size_t i = 0; i < fetchs_.size(); ++i) { int idx = boost::get(fetchs_[i]->GetAttr("col")); @@ -246,7 +246,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } CHECK(argument_.transformed_program_desc); - VLOG(5) << "to prepare executor"; + VLOG(50) << "to prepare executor"; inference_program_.reset( new framework::ProgramDesc(*argument_.transformed_program_desc)); if (argument_.Has(framework::ir::kParamScopeAttr)) { @@ -260,7 +260,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { - VLOG(3) << "create AnalysisConfig"; + VLOG(30) << "create AnalysisConfig"; if (config.use_gpu) { // 1. 
GPU memeroy PADDLE_ENFORCE_GT( @@ -274,7 +274,7 @@ std::unique_ptr CreatePaddlePredictor< std::string flag = "--fraction_of_gpu_memory_to_use=" + std::to_string(config.fraction_of_gpu_memory); flags.push_back(flag); - VLOG(3) << "set flag: " << flag; + VLOG(30) << "set flag: " << flag; framework::InitGflags(flags); } } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index b7dc2067332278c1c38df4beefb5059efe76417f..a9f4cce6dfa1c92301f57a7b1dd024a61f99d5ab 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#include +#include #include #include #include "paddle/fluid/framework/naive_executor.h" diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 01ea942d3c8d20180cfc9664b8601ba87a898e86..20fab8078fedf837564496aa296648bf5970a348 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle_inference_api.h" namespace paddle { diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index d06ab8f8c8e3c0adf4a4074eb1450012126e83ea..fcbc3803d04def9a9855f2fee489e7e2c561b454 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -157,7 +157,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to get fetches"; return false; } - VLOG(3) << "predict cost: " << timer.toc() << "ms"; + VLOG(30) << "predict cost: " << timer.toc() << "ms"; // Fix TensorArray reuse not cleaned bug. 
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 7ac468ee4d33f49bba20a07c976055a083743cbc..94b3933497daac1a4db1787994ea1bc33ec4e74f 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -34,7 +34,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { bool Init(const std::shared_ptr& parent_scope) { FLAGS_IA_enable_tensorrt_subgraph_engine = true; - VLOG(3) << "Predictor::init()"; + VLOG(30) << "Predictor::init()"; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -70,7 +70,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { OptimizeInferenceProgram(); ctx_ = executor_->Prepare(*inference_program_, 0); - VLOG(5) << "to create variables"; + VLOG(50) << "to create variables"; executor_->CreateVariables(*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); // Get the feed_target_names and fetch_target_names @@ -114,9 +114,9 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { new ProgramDesc(*inference_program_->Proto())); Singleton::Global().Run(&argument); CHECK(argument.transformed_program_desc); - VLOG(5) << "transformed program:\n" - << argument.transformed_program_desc->SerializeAsString(); - VLOG(5) << "to prepare executor"; + VLOG(50) << "transformed program:\n" + << argument.transformed_program_desc->SerializeAsString(); + VLOG(50) << "to prepare executor"; inference_program_.reset( new framework::ProgramDesc(*argument.transformed_program_desc)); } @@ -129,7 +129,7 @@ template <> std::unique_ptr CreatePaddlePredictor( const MixedRTConfig& config) { - VLOG(3) << "create TensorRTSubgraphPredictor"; + VLOG(30) << "create TensorRTSubgraphPredictor"; if (config.use_gpu) { // 1. 
GPU memeroy PADDLE_ENFORCE_GT( @@ -143,7 +143,7 @@ CreatePaddlePredictor( std::string flag = "--fraction_of_gpu_memory_to_use=" + std::to_string(config.fraction_of_gpu_memory); flags.push_back(flag); - VLOG(3) << "set flag: " << flag; + VLOG(30) << "set flag: " << flag; framework::InitGflags(flags); } } diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 4a8404f21c6ec6a1647e964ac3538b4b49151009..6460514f3f80cac3c5e52560ab61b5cc7fd74636 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -45,7 +45,7 @@ void Main() { config.fraction_of_gpu_memory = 0.1; // set by yourself predictor = CreatePaddlePredictor(config); - VLOG(3) << "begin to process data"; + VLOG(30) << "begin to process data"; // Just a single batch of data. std::string line; std::ifstream file(FLAGS_data); @@ -60,13 +60,13 @@ void Main() { PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); input.dtype = PaddleDType::FLOAT32; - VLOG(3) << "run executor"; + VLOG(30) << "run executor"; std::vector output; predictor->Run({input}, &output, 1); - VLOG(3) << "output.size " << output.size(); + VLOG(30) << "output.size " << output.size(); auto& tensor = output.front(); - VLOG(3) << "output: " << SummaryTensor(tensor); + VLOG(30) << "output: " << SummaryTensor(tensor); // compare with reference result CheckOutput(FLAGS_refer, tensor); diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index d70c6aea791219a40c3164b51499f9d5e562be71..664b9d01c7810aa4f053cd6ebbff5f3f7619fd05 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -47,7 +47,7 @@ static void split(const std::string& str, char sep, } Record ProcessALine(const std::string& line) { - VLOG(3) << "process a line"; + VLOG(30) << "process a line"; std::vector columns; 
split(line, '\t', &columns); CHECK_EQ(columns.size(), 2UL) @@ -65,8 +65,8 @@ Record ProcessALine(const std::string& line) { for (auto& s : shape_strs) { record.shape.push_back(std::stoi(s)); } - VLOG(3) << "data size " << record.data.size(); - VLOG(3) << "data shape size " << record.shape.size(); + VLOG(30) << "data size " << record.data.size(); + VLOG(30) << "data shape size " << record.shape.size(); return record; } @@ -78,8 +78,8 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { file.close(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); - VLOG(3) << "predictor output numel " << numel; - VLOG(3) << "reference output numel " << refer.data.size(); + VLOG(30) << "predictor output numel " << numel; + VLOG(30) << "reference output numel " << refer.data.size(); CHECK_EQ(numel, refer.data.size()); switch (output.dtype) { case PaddleDType::INT64: { diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 8d546e3e9c740c10bcf2984e073c956e3612625c..d747f855803a6997d08957b5d35a56a0fe4160c5 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -49,11 +49,11 @@ void Main(bool use_gpu) { config.fraction_of_gpu_memory = 0.1; // set by yourself } - VLOG(3) << "init predictor"; + VLOG(30) << "init predictor"; predictor = CreatePaddlePredictor(config); analysis_predictor = CreatePaddlePredictor(config); - VLOG(3) << "begin to process data"; + VLOG(30) << "begin to process data"; // Just a single batch of data. 
std::string line; std::ifstream file(FLAGS_data); @@ -68,13 +68,13 @@ void Main(bool use_gpu) { PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); input.dtype = PaddleDType::FLOAT32; - VLOG(3) << "run executor"; + VLOG(30) << "run executor"; std::vector output, analysis_output; predictor->Run({input}, &output, 1); - VLOG(3) << "output.size " << output.size(); + VLOG(30) << "output.size " << output.size(); auto& tensor = output.front(); - VLOG(3) << "output: " << SummaryTensor(tensor); + VLOG(30) << "output: " << SummaryTensor(tensor); // compare with reference result CheckOutput(FLAGS_refer, tensor); diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 4ae6c6dc9f44650c1c62f5be5448864d817513b1..244b0b567b5df6735acd7f1bf3c2056f449be872 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -26,7 +26,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // parameter. 
if (var_name == "feed" || var_name == "fetch") continue; if (var->Type() == typeid(framework::LoDTensorArray)) { - VLOG(4) << "collect " << var_name; + VLOG(40) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } } @@ -34,7 +34,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { CollectTensorArrays(kid); } - VLOG(3) << "Collect " << arrays_.size() << " arrays"; + VLOG(30) << "Collect " << arrays_.size() << " arrays"; flag_ = false; } } diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index e46dc1326951f68fd030f2208b9bea1647d0026d..af21c0095c28b26c0ef4afc83572a9681d49d497 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -16,13 +16,14 @@ #include #include +#include #include // NOLINT #include #include #include #include +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" -#include "paddle_inference_api.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index e246a06fd079d837ac321197914c9f70b528f2c8..bb749e8f8b0ba9d5cd82d91ce86c619f52f34c30 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -59,7 +59,8 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) { bool IsPersistable(const framework::VarDesc* var) { if (var->Persistable() && var->GetType() != framework::proto::VarType::FEED_MINIBATCH && - var->GetType() != framework::proto::VarType::FETCH_LIST) { + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { return true; } return false; @@ -77,7 +78,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, for (auto* var : global_block.AllVars()) { if (IsPersistable(var)) { - VLOG(3) << "persistable variable's name: " << var->Name(); + VLOG(30) << "persistable variable's name: " << var->Name(); 
framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); @@ -120,7 +121,7 @@ std::unique_ptr Load(framework::Executor* executor, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; std::string program_desc_str; - VLOG(3) << "loading model from " << model_filename; + VLOG(30) << "loading model from " << model_filename; ReadBinaryFile(model_filename, &program_desc_str); std::unique_ptr main_program( diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index e73c5bbf57501e4ff3c080a46d91685035652bfa..0b756534ec6fbf27a3e92bf39fb7544d9785ca48 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -27,7 +27,7 @@ class ActivationOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - LOG(INFO) + VLOG(3) << "convert a fluid Activation op to tensorrt activation layer whose " "type is " << op_type_; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 3330af2da6c97ad153dcecd86be4b441eac62b5e..d017bac66dd99a4b54c44ec786de61d1e66b8981 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -23,7 +23,7 @@ class BatchNormOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - LOG(INFO) << "convert a fluid batch norm op to tensorrt batch_norm"; + VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc 
b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index a11dfa1e8f2dacfad067d025678911200db500fb..b2e7c593e85974898012f8a353817a27ca212f4d 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -25,7 +25,7 @@ class ConcatOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias"; + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 7bcf2dd1eeb17e802c5647df31945284ae08fa95..43950b8c048b4e1b8974956948caa639812b2f78 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -37,8 +37,7 @@ class Conv2dOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - LOG(INFO) - << "convert a fluid conv2d op to tensorrt conv layer without bias"; + VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 9533ecbcfda4e2500fd201d8efc64fc5bd97169a..ddbc724e3b2a48b75df17f9bda691a1fd3883c32 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -25,7 +25,7 @@ class DropoutOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert a fluid dropout op to 
tensorrt dropout layer"; + VLOG(3) << "convert a fluid dropout op to tensorrt dropout layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 0a6ce568f194f03c7259e1ebf28dd6ce4df2d594..671bcd8aa9a9fff34644a056499961cf6ab81287 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -26,7 +26,7 @@ class ElementwiseWeightOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -108,7 +108,7 @@ class ElementwiseTensorOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
framework::OpDesc op_desc(op, nullptr); - LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 7c21ecd95da07b498eed2ab1bbdcc0e8cd184787..eef4fab4e86f05fa80bc614371f1aa43e433407e 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -52,7 +52,7 @@ class FcOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias"; + VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index 514eb659a8da73b6e56b5d17148ec0cb2aeaa135..5b6aaad49833cedbd8d1ee0ec5d24c7f983190e6 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias"; + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 218030a591fcc7e533ef37062265449d4b6044bc..4afcb0aecec9d07b52d2fd701fae8750067a6041 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -25,7 +25,7 @@ class PadOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert a fluid transpose op to tensorrt tranpose layer"; + VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 677f85152f202b514d0563f885d872c84faba19a..48850020840a49bd309c007943f14b2f7eec5e2d 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -25,7 +25,7 @@ class Pool2dOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) + VLOG(3) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 0064f90fd7944403c14d4d47616ea82f681ceb74..80bfb2d190a5637032e7c18fbac7f22b3a9e81e1 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -25,7 +25,7 @@ class SoftMaxOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) + VLOG(3) << "convert a fluid softmax op to tensorrt softmax layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 
d9d3827321127631c0af6e5cfd2dfdd640cee146..828181200e300c370bbfa234c3c23ae44810878c 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -134,7 +134,7 @@ class TensorRTEngine : public EngineBase { std::unordered_map> weight_map; - // TODO: (NHZLX) + // TODO(NHZLX) // In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index b6e7968108403c9c9c192759c44eac040d1c5073..fc7ca7714e9325d2b6bce6189300aa339c81c2ba 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -52,7 +52,7 @@ class NaiveLogger : public nvinfer1::ILogger { void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { case Severity::kINFO: - LOG(INFO) << msg; + VLOG(3) << msg; break; case Severity::kWARNING: LOG(WARNING) << msg; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 2ca84c80058b35840aff5d072cdc99ecf5165f8e..5287cd51cd2c339601b91b6a5e9ad4b9b1f5ee48 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -110,5 +110,5 @@ if(WITH_GPU AND TENSORRT_FOUND) endif() cc_test(test_trt_models SRCS trt_models_tester.cc ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models - DEPS paddle_inference_tensorrt_subgraph_engine) + DEPS paddle_inference_tensorrt_subgraph_engine SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 8933296490793a7693124eba23f8cf0801881e14..b2cd49af9aa580482fad84b6b23cb19f954e22fc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ 
b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -27,7 +27,7 @@ struct Record { }; Record ProcessALine(const std::string &line) { - VLOG(3) << "process a line"; + VLOG(30) << "process a line"; std::vector columns; split(line, '\t', &columns); CHECK_EQ(columns.size(), 2UL) @@ -45,8 +45,8 @@ Record ProcessALine(const std::string &line) { for (auto &s : shape_strs) { record.shape.push_back(std::stoi(s)); } - VLOG(3) << "data size " << record.data.size(); - VLOG(3) << "data shape size " << record.shape.size(); + VLOG(30) << "data size " << record.data.size(); + VLOG(30) << "data shape size " << record.shape.size(); return record; } diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 26ef27c3caafadb4801b0ae52133f6175655ce0a..dd7ffaa26426edebd47ec3f6fb275ad5a2d23322 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -32,11 +32,11 @@ BuddyAllocator::BuddyAllocator( system_allocator_(std::move(system_allocator)) {} BuddyAllocator::~BuddyAllocator() { - VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " - "have actually been freed"; + VLOG(100) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; + VLOG(100) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -57,12 +57,12 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // acquire the allocator lock std::lock_guard lock(mutex_); - VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size " - << size; + VLOG(100) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; // if the allocation is huge, send directly to the system 
allocator if (size > max_chunk_size_) { - VLOG(10) << "Allocate from system allocator."; + VLOG(100) << "Allocate from system allocator."; return SystemAlloc(size); } @@ -77,9 +77,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return nullptr; } } else { - VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) - << " at address " - << reinterpret_cast(std::get<2>(*it))->data(); + VLOG(100) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); } total_used_ += size; @@ -96,10 +96,10 @@ void BuddyAllocator::Free(void* p) { // Acquire the allocator lock std::lock_guard lock(mutex_); - VLOG(10) << "Free from address " << block; + VLOG(100) << "Free from address " << block; if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { - VLOG(10) << "Free directly from system allocator"; + VLOG(100) << "Free directly from system allocator"; system_allocator_->Free(block, block->total_size(cache_), block->index(cache_)); @@ -116,8 +116,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the right buddy if (block->has_right_buddy(cache_)) { - VLOG(10) << "Merging this block " << block << " with its right buddy " - << block->right_buddy(cache_); + VLOG(100) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); auto right_buddy = block->right_buddy(cache_); @@ -134,8 +134,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { - VLOG(10) << "Merging this block " << block << " with its left buddy " - << block->left_buddy(cache_); + VLOG(100) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); auto left_buddy = block->left_buddy(cache_); @@ -151,8 +151,8 @@ void BuddyAllocator::Free(void* p) { } // Dumping this block into pool - VLOG(10) << "Inserting free block (" << block << ", " - << block->total_size(cache_) << ")"; + VLOG(100) << 
"Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); @@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(&index, size); - VLOG(10) << "Allocated " << p << " from system allocator."; + VLOG(100) << "Allocated " << p << " from system allocator."; if (p == nullptr) return nullptr; @@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { if (p == nullptr) return pool_.end(); - VLOG(10) << "Creating and inserting new block " << p - << " from system allocator"; + VLOG(100) << "Creating and inserting new block " << p + << " from system allocator"; static_cast(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); @@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, auto block = static_cast(std::get<2>(*it)); pool_.erase(it); - VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) - << ") into"; + VLOG(100) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; block->split(&cache_, size); - VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) - << ")"; + VLOG(100) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; block->set_type(&cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist if (block->has_right_buddy(cache_)) { if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { - VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", " - << block->right_buddy(cache_)->total_size(cache_) << ")"; + VLOG(100) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->right_buddy(cache_)->index(cache_), @@ -284,7 +284,7 @@ void 
BuddyAllocator::CleanIdleFallBackAlloc() { return; } - VLOG(10) << "Return block " << block << " to fallback allocator."; + VLOG(100) << "Return block " << block << " to fallback allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() { MemoryBlock* block = static_cast(std::get<2>(*pool)); - VLOG(10) << "Return block " << block << " to base allocator."; + VLOG(100) << "Return block " << block << " to base allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc index b86e4f38c42a26e155f276f9b73cbed1d0d83f7d..152e4e7f9fa2e18a2b3e5b4042089660d291badf 100644 --- a/paddle/fluid/memory/detail/meta_cache.cc +++ b/paddle/fluid/memory/detail/meta_cache.cc @@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const { return existing_desc->second; } else { auto* desc = reinterpret_cast(block); - VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type; + VLOG(100) << "Load MemoryBlock::Desc type=" << desc->type; PADDLE_ASSERT(desc->check_guards()); return *reinterpret_cast(block); } diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0f13a4ea9c1af175771f5cc201ea5c0a8a0f7555..ec87793b442058ddfc9e22fee47fb0aa5f430b93 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -71,18 +71,18 @@ struct NaiveAllocator { template <> void* Alloc(platform::CPUPlace place, size_t size) { - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(100) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); if (FLAGS_init_allocated_mem) { memset(p, 0xEF, size); } - VLOG(10) << " pointer=" << p; + VLOG(100) << " pointer=" << p; return p; } template <> void 
Free(platform::CPUPlace place, void* p) { - VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + VLOG(100) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -110,12 +110,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { std::unique_ptr(new detail::GPUAllocator(i)), platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); - VLOG(10) << "\n\nNOTE: each GPU device use " - << FLAGS_fraction_of_gpu_memory_to_use * 100 - << "% of GPU memory.\n" - << "You can set GFlags environment variable '" - << "FLAGS_fraction_of_gpu_memory_to_use" - << "' to change the fraction of GPU usage.\n\n"; + VLOG(100) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; } }); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 2a7de024bf4ec1f49f7672de782f68ba8b353bbd..776bdfaee8ac24b066b95328fdb59d240f16a446 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS) set(DEPS_OPS "") set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n") + +set(PART_CUDA_KERNEL_FILES) function(op_library TARGET) # op_library is a function to create op library. The interface is same as # cc_library. 
But it handle split GPU/CPU code and link some common library @@ -37,6 +39,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu + ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) + list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) list(APPEND hip_cu_srcs ${TARGET}.hip.cu) endif() @@ -317,6 +325,7 @@ op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) +op_library(tensor_array_to_tensor_op DEPS concat_op) op_library(concat_op DEPS concat_and_split) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) @@ -326,6 +335,8 @@ foreach(src ${GENERAL_OPS}) endforeach() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + + if (NOT WIN32) add_subdirectory(reader) endif(NOT WIN32) @@ -352,3 +363,14 @@ if(NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) + +if(WITH_GPU) + foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES}) + file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT}) + if (MATCHED) + string(STRIP ${CMAKE_MATCH_1} MATCHED) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n") + endif() + endforeach() +endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 9ddb3a5d29f973047507855b43b226913a3600b5..ea260a3e92b775023085fd02eec33e6ecfaf2e81 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -91,16 +91,12 @@ class 
ActivationOp : public framework::OperatorWithKernel { } }; -class ActivationOpInferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto x_name = op_desc.Input("X")[0]; - auto out_name = op_desc.Output("Out")[0]; - auto& x = block->FindRecursiveOrCreateVar(x_name); - auto& out = block->FindRecursiveOrCreateVar(out_name); - out.SetType(x.GetType()); - out.SetDataType(x.GetDataType()); +class ActivationOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; } }; diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 0747469e0f4c4fe6a323a499c720a54d1e278e09..4ffc7f364bcb9bda5f94be5fe071c73bd5c40ca7 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -95,7 +95,7 @@ class ActivationGradKernel auto x = framework::EigenVector::Flatten(*X); functor(*place, x, out, dout, dx); } else { - VLOG(10) << " Inplace activation "; + VLOG(100) << " Inplace activation "; auto x = framework::EigenVector::Flatten(*dX); functor(*place, x, out, dout, dx); } diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index 3455d1ee54e8e6e498d0b0e6932ec099af9c0b30..48e0448d09c64e2c2fa655d125064e7a6572e30e 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); if (grad.rows().size() == 0) { - VLOG(3) << "grad row size is 0!!"; + VLOG(30) << "grad row size is 0!!"; return; } diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h index 5f371235f160c416058e877dbba2d9fe89abf7db..0b40d3de890a02a9dbec2328f9f6388ffa35561b 
100644 --- a/paddle/fluid/operators/add_position_encoding_op.h +++ b/paddle/fluid/operators/add_position_encoding_op.h @@ -66,9 +66,10 @@ class AddPositionEncodingKernel : public framework::OpKernel { x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i]; for (int j = 0; j < max_length; ++j) { for (int k = 0; k < half_size; ++k) { - const double val = (half_size > 1) - ? j / pow(10000.0, double(k) / (half_size - 1)) - : j / 10000.0; + const double val = + (half_size > 1) + ? j / pow(10000.0, static_cast(k) / (half_size - 1)) + : j / 10000.0; dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta; dst_ptr[half_size + k] = src_ptr[half_size + k] * alpha + cos(val) * beta; diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index 4309f0a5497456065e5c43bc8f7b265fa711f699..eddf34494bdab18c9d4ae1fb3d1e5d1a71fe590e 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase { } else { offset = static_cast(*i_tensor.data()); } - VLOG(10) << " Offset = " << offset; + VLOG(100) << " Offset = " << offset; return offset; } }; diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 6257e04b010d8c580e69e466759e8e80d344c105..3c40135eca00f4e0bbff9b0f0f7cf2a4c85ec556 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; - VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " [" - << ", " << end_offset << "]"; + VLOG(100) << "idx=" << idx << " x_idx=" << x_idx << " [" + << ", " << end_offset << "]"; // Copy data PADDLE_ENFORCE_GE(end_offset, start_offset); size_t len = end_offset - start_offset; diff --git 
a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 3eb473832577bd348b33ba9b0be9e597b78f26bc..cf245f5038f5f5ad1b623542aa14686eff8aad32 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -170,6 +170,15 @@ The required data format for this layer is one of the following: } }; +class BatchNormOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Y"}}; + } +}; + template class BatchNormKernel : public framework::OpKernel { @@ -525,7 +534,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, - ops::BatchNormGradMaker); + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker); REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index aaed335c905c0d80cd519afc5fecb06af73fcfe7..0609027c6940533483173209176f3243ccb36f8f 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -96,7 +96,7 @@ class BatchNormKernel mode_ = CUDNN_BATCHNORM_SPATIAL; #endif - VLOG(3) << "Setting descriptors."; + VLOG(30) << "Setting descriptors."; std::vector dims; std::vector strides; if (data_layout == DataLayout::kNCHW) { diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 62771d09f112785ca1ba741a0ba239b1f0234633..791f8a4d3be6780c584997113b7ffcfb7ab35667 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); auto selected_items = 
ToMap(items, high_level.back()); - VLOG(3) << "selected_items:"; + VLOG(30) << "selected_items:"; for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset:" << i; + VLOG(30) << "offset:" << i; for (auto &item : selected_items[i]) { - VLOG(3) << ItemToString(item); + VLOG(30) << ItemToString(item); } } @@ -138,11 +138,11 @@ std::vector> BeamSearch::SelectTopBeamSizeItems( } result.emplace_back(items); } - VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + VLOG(30) << "SelectTopBeamSizeItems result size " << result.size(); for (auto &items : result) { - VLOG(3) << "item set:"; + VLOG(30) << "item set:"; for (auto &item : items) { - VLOG(3) << ItemToString(item); + VLOG(30) << ItemToString(item); } } diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu deleted file mode 100644 index 4c1971538495c6f111e9db18f4014786f6f0dd58..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bilinear_interp_op.cu +++ /dev/null @@ -1,207 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/fluid/operators/bilinear_interp_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; - -template -__global__ void KeBilinearInterpFw( - const T* in, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const T ratio_h, const T ratioW) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < nthreads) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - int channel_id = out_id_w / out_img_size; - - int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = ratio_h * out_img_idy; - int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T h1lambda = ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; - - int out_img_idx = tid % out_img_w; - int in_img_idx = ratioW * out_img_idx; - int w_id = (in_img_idx < in_img_w - 1) ? 
1 : 0; - T w1lambda = ratioW * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - - // bilinear interpolation - out[out_id_h * output_w + out_id_w] = - h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + - h1lambda * (w2lambda * in_pos[h_id * in_img_w] + - w1lambda * in_pos[h_id * in_img_w + w_id]); - } -} - -template -__global__ void KeBilinearInterpBw( - T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, - const size_t input_w, const T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const T ratio_h, const T ratioW) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < nthreads) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - int channel_id = out_id_w / out_img_size; - - int out_img_idy = (out_id_w % out_img_size) / out_img_w; - int in_img_idy = ratio_h * out_img_idy; - int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T h1lambda = ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; - - int out_img_idx = tid % out_img_w; - int in_img_idx = ratioW * out_img_idx; - int w_id = (in_img_idx < in_img_w - 1) ? 
1 : 0; - T w1lambda = ratioW * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - const T* out_pos = &out[out_id_h * output_w + out_id_w]; - atomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - atomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); - atomicAdd(&in_pos[h_id * in_img_w], h1lambda * w2lambda * out_pos[0]); - atomicAdd(&in_pos[h_id * in_img_w + w_id], - h1lambda * w1lambda * out_pos[0]); - } -} - -template -class BilinearInterpOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "This kernel only runs on GPU device."); - auto* input_t = ctx.Input("X"); // float tensor - auto* output_t = ctx.Output("Out"); // float tensor - auto* input = input_t->data(); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - auto out_dims = output_t->dims(); - auto out_size_t = ctx.Input("OutSize"); - if (out_size_t != nullptr) { - Tensor sizes; - framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_h = size_data[0]; - out_w = size_data[1]; - } - auto* output = output_t->mutable_data( - {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace()); - - int batch_size = input_t->dims()[0]; - int channels = input_t->dims()[1]; - int in_h = input_t->dims()[2]; - int in_w = input_t->dims()[3]; - - int in_hw = in_h * in_w; - int out_hw = out_h * out_w; - int in_chw = channels * in_hw; - int out_chw = channels * out_hw; - - T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - T ratio_w = (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; - - if (in_h == out_h && in_w == out_w) { - memcpy(output, input, input_t->numel() * sizeof(T)); - } else { - int threadNum = batch_size * out_chw; - int blocks = (threadNum + 1024 - 1) / 1024; - - KeBilinearInterpFw< - T><<>>( - input, in_h, in_w, batch_size, in_chw, output, out_h, out_w, - batch_size, out_chw, channels, ratio_h, ratio_w); - } - } -}; - -template -class BilinearInterpGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_input_t = ctx.Output(framework::GradVarName("X")); - auto* d_output_t = ctx.Input(framework::GradVarName("Out")); - auto* d_output = d_output_t->data(); - auto* d_input = d_input_t->mutable_data(ctx.GetPlace()); - - auto& device_ctx = - ctx.template device_context(); - math::SetConstant zero; - zero(device_ctx, d_input_t, static_cast(0.0)); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - auto out_size_t = ctx.Input("OutSize"); - if (out_size_t != nullptr) { - Tensor sizes; - framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_h = size_data[0]; - out_w = size_data[1]; - } - - int batch_size = d_input_t->dims()[0]; - int channels = d_input_t->dims()[1]; - int in_h = d_input_t->dims()[2]; - int in_w = d_input_t->dims()[3]; - - int in_hw = in_h * in_w; - int out_hw = out_h * out_w; - int in_chw = channels * in_hw; - int out_chw = channels * out_hw; - - T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - T ratio_w = (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; - - if (in_h == out_h && in_w == out_w) { - memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); - } else { - int threadNum = batch_size * out_chw; - int blocks = (threadNum + 1024 - 1) / 1024; - - KeBilinearInterpBw< - T><<>>( - d_input, in_h, in_w, batch_size, in_chw, d_output, out_h, out_w, - batch_size, out_chw, channels, ratio_h, ratio_w); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(bilinear_interp, - ops::BilinearInterpOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad, - ops::BilinearInterpGradOpCUDAKernel); diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h deleted file mode 100644 index 70847cb8c1abe2e94bc844ab8117d1f23fea533b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bilinear_interp_op.h +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class BilinearInterpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input_t = ctx.Input("X"); // float tensor - auto* output_t = ctx.Output("Out"); // float tensor - auto out_dims = output_t->dims(); - auto* input = input_t->data(); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - auto out_size_t = ctx.Input("OutSize"); - if (out_size_t != nullptr) { - auto out_size_data = out_size_t->data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - auto* output = output_t->mutable_data( - {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace()); - int batch_size = input_t->dims()[0]; - int channels = input_t->dims()[1]; - int in_h = input_t->dims()[2]; - int in_w = input_t->dims()[3]; - - int in_hw = in_h * in_w; - int out_hw = out_h * out_w; - int in_chw = channels * in_hw; - int out_chw = channels * out_hw; - - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; - - if (in_h == out_h && in_w == out_w) { - memcpy(output, input, input_t->numel() * sizeof(T)); - } else { - for (int k = 0; k < batch_size; ++k) { // loop for batches - for (int i = 0; i < out_h; ++i) { // loop for images - int h = ratio_h * i; - int hid = (h < in_h - 1) ? 1 : 0; - float h1lambda = ratio_h * i - h; - float h2lambda = 1.f - h1lambda; - - for (int j = 0; j < out_w; ++j) { - int w = ratio_w * j; - int wid = (w < in_w - 1) ? 
1 : 0; - float w1lambda = ratio_w * j - w; - float w2lambda = 1.f - w1lambda; - // calculate four position for bilinear interpolation - const T* in_pos = &input[k * in_chw + h * in_w + w]; - T* out_pos = &output[k * out_chw + i * out_w + j]; - - for (int c = 0; c < channels; ++c) { // loop for channels - // bilinear interpolation - out_pos[0] = static_cast( - h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + - h1lambda * (w2lambda * in_pos[hid * in_w] + - w1lambda * in_pos[hid * in_w + wid])); - in_pos += in_hw; - out_pos += out_hw; - } - } - } - } - } - } -}; - -template -class BilinearInterpGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_input_t = ctx.Output(framework::GradVarName("X")); - auto* d_output_t = ctx.Input(framework::GradVarName("Out")); - auto* d_output = d_output_t->data(); - auto* d_input = d_input_t->mutable_data(ctx.GetPlace()); - auto& device_ctx = - ctx.template device_context(); - math::SetConstant zero; - zero(device_ctx, d_input_t, static_cast(0.0)); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - auto out_size_t = ctx.Input("OutSize"); - if (out_size_t != nullptr) { - auto out_size_data = out_size_t->data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - - int batch_size = d_input_t->dims()[0]; - int channels = d_input_t->dims()[1]; - int in_h = d_input_t->dims()[2]; - int in_w = d_input_t->dims()[3]; - - int in_hw = in_h * in_w; - int out_hw = out_h * out_w; - int in_chw = channels * in_hw; - int out_chw = channels * out_hw; - - float ratio_h = - (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - float ratio_w = - (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; - - if (in_h == out_h && in_w == out_w) { - memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); - } else { - for (int k = 0; k < batch_size; ++k) { // loop for batches - for (int i = 0; i < out_h; ++i) { // loop for images - int h = ratio_h * i; - int hid = (h < in_h - 1) ? 1 : 0; - float h1lambda = ratio_h * i - h; - float h2lambda = 1 - h1lambda; - - for (int j = 0; j < out_w; ++j) { - int w = ratio_w * j; - int wid = (w < in_w - 1) ? 1 : 0; - float w1lambda = ratio_w * j - w; - float w2lambda = 1 - w1lambda; - T* in_pos = &d_input[k * in_chw + h * in_w + w]; - const T* out_pos = &d_output[k * out_chw + i * out_w + j]; - - for (int c = 0; c < channels; ++c) { // loop for channels - in_pos[0] += static_cast(h2lambda * w2lambda * out_pos[0]); - in_pos[wid] += static_cast(h2lambda * w1lambda * out_pos[0]); - in_pos[hid * in_w] += - static_cast(h1lambda * w2lambda * out_pos[0]); - in_pos[hid * in_w + wid] += - static_cast(h1lambda * w1lambda * out_pos[0]); - in_pos += in_hw; - out_pos += out_hw; - } - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc index 7c072cb071a5d1e6a0549cf6d9eff18fd2533edc..defa287bdb913e406aa7e2a383cefc3cca8c4d94 100644 --- a/paddle/fluid/operators/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/checkpoint_notify_op.cc @@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase { auto lookup_table_save_dir = string::Sprintf("%s/%s_%d", dir, lookup_table_name, i); rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir); - VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name - << " and dir:" << dir << " to " << epmap[i]; + VLOG(30) << "checkpoint notify sending lookup table: " + << lookup_table_name << " and dir:" << dir << " to " << epmap[i]; } PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } 
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 57817da71adfd80faad29a48b05ba2f326de6c07..093b0a9a1f9ac05cf4d72fc748fac827387e5dbe 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0."); if (n == 1) { - VLOG(3) << "Warning: concat op have only one input, may waste memory"; + VLOG(30) << "Warning: concat op have only one input, may waste memory"; } auto out_dims = ins[0]; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 76eda51ad414030074b69ee8d4f796c5c32d12f3..3083e622c3066879e107f930a45bcec36d347f80 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -15,15 +15,22 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" DEFINE_bool(cudnn_deterministic, false, "Whether allow using an autotuning algorithm for convolution " "operator. The autotuning algorithm may be non-deterministic. 
If " "true, the algorithm is deterministic."); +DEFINE_uint64(conv_workspace_size_limit, 4096, + "cuDNN convolution workspace limit in MB unit."); +DEFINE_bool(cudnn_exhaustive_search, false, + "Whether enable exhaustive search for cuDNN convolution or " + "not, defalut is False."); namespace paddle { namespace operators { @@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; +static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache"; +static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache"; +static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; + static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; +static constexpr size_t kNUM_CUDNN_FWD_ALGS = + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; +static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; +static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; + template class CUDNNConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use CUDAPlace."); auto* input = ctx.Input("Input"); @@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); int64_t user_workspace_size = static_cast(ctx.Attr("workspace_size_MB")); + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); const T* input_data = input->data(); const T* filter_data = filter->data(); @@ -120,19 +141,19 @@ class CUDNNConvOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- size_t workspace_size_in_bytes; // final workspace to allocate. 
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; - if (user_workspace_size > 0) { - workspace_size_limit = user_workspace_size * 1024 * 1024; + if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { + int64_t max_user_size = + std::max(static_cast(FLAGS_conv_workspace_size_limit), + user_workspace_size); + workspace_size_limit = max_user_size * 1024 * 1024; } + // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; - auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( - handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); - + bool half_float = false; #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) // Tensor core is supported since the volta GPU and // is only enabled when input and filter data are float16 @@ -143,14 +164,66 @@ class CUDNNConvOpKernel : public framework::OpKernel { cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); // Currently tensor core is only enabled using this algo algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - VLOG(5) << "use cudnn_tensor_op_math"; + half_float = true; + VLOG(50) << "use cudnn_tensor_op_math"; } else { CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; + VLOG(50) << "NOT use cudnn_tensor_op_math"; } #endif + auto x_dims = framework::vectorize(input->dims()); + auto f_dims = framework::vectorize(filter->dims()); + if ((!exhaustive_search) && (!half_float)) { + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, 
&algo)); + VLOG(3) << "cuDNN forward algo " << algo; + } else if (exhaustive_search && (!half_float)) { + AlgorithmsCache* algo_cache = nullptr; + if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { + algo_cache = + ctx.scope() + .FindVar(kCUDNNFwdAlgoCache) + ->GetMutable>(); + } else { + algo_cache = + const_cast(ctx.scope()) + .Var(kCUDNNFwdAlgoCache) + ->GetMutable>(); + } + algo = algo_cache->GetAlgorithm( + x_dims, f_dims, strides, paddings, dilations, 0, [&]() { + int returned_algo_count; + std::array + fwd_perf_stat; + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, + output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, + workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); + + VLOG(3) << "Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = fwd_perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time + << " " << stat.memory; + } + return fwd_perf_stat[0].algo; + }); + VLOG(3) << "choose algo " << algo; + } else { + PADDLE_ENFORCE(half_float, + "cuDNN exhaustive search doesn't support half float."); + } + // get workspace size able to allocate CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, @@ -162,7 +235,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); for (int i = 0; i < groups; i++) { auto cudnn_func = [&](void* cudnn_workspace) { CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( @@ -180,6 +252,7 @@ template class 
CUDNNConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use CUDAPlace."); auto input = ctx.Input("Input"); @@ -198,6 +271,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); int64_t user_workspace_size = static_cast(ctx.Attr("workspace_size_MB")); + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + if (exhaustive_search && FLAGS_cudnn_deterministic) { + PADDLE_THROW( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time."); + } // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor input_desc; @@ -265,14 +345,66 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnnConvolutionBwdFilterAlgo_t filter_algo; size_t workspace_size_in_bytes = 0, tmp_size = 0; size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; - if (user_workspace_size > 0) { - workspace_size_limit = user_workspace_size * 1024 * 1024; + if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { + int64_t max_user_size = + std::max(static_cast(FLAGS_conv_workspace_size_limit), + user_workspace_size); + workspace_size_limit = max_user_size * 1024 * 1024; } - auto& dev_ctx = ctx.template device_context(); + auto x_dims = framework::vectorize(input->dims()); + auto f_dims = framework::vectorize(filter->dims()); auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { - if (!FLAGS_cudnn_deterministic) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + if (exhaustive_search) { + AlgorithmsCache* data_algo_cache; + if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) { + data_algo_cache = + ctx.scope() + .FindVar(kCUDNNBwdDataAlgoCache) + ->GetMutable< + 
AlgorithmsCache>(); + } else { + data_algo_cache = + const_cast(ctx.scope()) + .Var(kCUDNNBwdDataAlgoCache) + ->GetMutable< + AlgorithmsCache>(); + } + data_algo = data_algo_cache->GetAlgorithm( + x_dims, f_dims, strides, paddings, dilations, 0, [&]() { + int returned_algo_count; + std::array + data_perf_stat; + auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardDataAlgorithmEx( + handle, cudnn_filter_desc, filter_data, + cudnn_output_grad_desc, output_grad_data, + cudnn_conv_desc, cudnn_input_desc, input_grad_data, + kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, + data_perf_stat.data(), cudnn_workspace, + workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_bd_data_func, + workspace_size_limit); + + VLOG(3) << "Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = data_perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time + << " " << stat.memory; + } + return data_perf_stat[0].algo; + }); + VLOG(3) << "cuDNN backward data algo " << data_algo; + } else if (FLAGS_cudnn_deterministic) { + data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } else { CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, @@ -285,10 +417,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_input_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &data_algo)); - } else { - data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } - CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, @@ -297,17 +426,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - if (!FLAGS_cudnn_deterministic) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + if (exhaustive_search) { + AlgorithmsCache* f_algo_cache; + if 
(ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) { + f_algo_cache = + ctx.scope() + .FindVar(kCUDNNBwdFilterAlgoCache) + ->GetMutable< + AlgorithmsCache>(); + } else { + f_algo_cache = + const_cast(ctx.scope()) + .Var(kCUDNNBwdFilterAlgoCache) + ->GetMutable< + AlgorithmsCache>(); + } + filter_algo = f_algo_cache->GetAlgorithm( + x_dims, f_dims, strides, paddings, dilations, 0, [&]() { + int returned_algo_count; + std::array + filter_perf_stat; + auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardFilterAlgorithmEx( + handle, cudnn_input_desc, input_data, + cudnn_output_grad_desc, output_grad_data, + cudnn_conv_desc, cudnn_filter_desc, + filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS, + &returned_algo_count, filter_perf_stat.data(), + cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_bd_f_func, + workspace_size_limit); + return filter_perf_stat[0].algo; + }); + VLOG(3) << "cuDNN backward filter algo " << filter_algo; + } else if (FLAGS_cudnn_deterministic) { + filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + } else { CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &filter_algo)); - } else { - filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } - CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, @@ -317,7 +483,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to 
reset input_grad. diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..4b534321f746d5620005743eb8d45b71259156dd --- /dev/null +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { + +template +class AlgorithmsCache { + public: + // Caches the best algorithm for a given + // combination of tensor dimensions & compute data type. + TAlgorithm GetAlgorithm( + const std::vector& dims1, const std::vector& dims2, + const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, + int algorithmFlags, // can set for different data type + std::function gen_func); + + private: + std::unordered_map hash_; + std::mutex mutex_; +}; + +template +TAlgorithm AlgorithmsCache::GetAlgorithm( + const std::vector& dims1, const std::vector& dims2, + const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, int algorithmFlags, + std::function gen_func) { + std::lock_guard lock(mutex_); + int64_t seed = 0; + // Hash all of the inputs, use to try and look up a previously + // discovered algorithm, or fall back to generating a new one. 
+ std::hash hashFn; + // do hash like boost + // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x + for (const auto num : dims1) { + seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + + for (const auto num : dims2) { + seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1; + } + + for (const auto num : strides) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 2; + } + + for (const auto num : paddings) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 3; + } + + for (const auto num : dilations) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 4; + } + + seed ^= hashFn(static_cast(algorithmFlags)) + 0x9e3779b9 + + (seed << 6) + (seed >> 2) + 5; + + if (seed == 0) return gen_func(); + + if (hash_.find(seed) == hash_.end()) { + TAlgorithm value = gen_func(); + hash_[seed] = value; + } + return hash_[seed]; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 2cd9979bd3426a15af34a49002d5db2fdd9aeec7..4d370746382a4247f51aafa189e86eece941c320 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -189,6 +189,11 @@ void Conv2DOpMaker::Make() { "workspace size can increase performance but also requires " "better hardware. This size should be chosen carefully.") .SetDefault(4096); + AddAttr("exhaustive_search", + "(bool, default false) cuDNN has many algorithm to calculation " + "convolution, whether enable exhaustive search ", + "for cuDNN convolution or not, defalut is False.") + .SetDefault(false); AddComment(R"DOC( Convolution Operator. 
@@ -219,6 +224,15 @@ $$ )DOC"); } +class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{ + {"Input", /*->*/ "Output"}}; + } +}; + void Conv3DOpMaker::Make() { AddInput( "Input", @@ -283,7 +297,11 @@ void Conv3DOpMaker::Make() { "workspace size can increase performance but also requires " "better hardware. This size should be chosen carefully.") .SetDefault(4096); - + AddAttr("exhaustive_search", + "(bool, default false) cuDNN has many algorithm to calculation " + "convolution, whether enable exhaustive search ", + "for cuDNN convolution or not, defalut is False.") + .SetDefault(false); AddComment(R"DOC( Convolution3D Operator. @@ -356,6 +374,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( namespace ops = paddle::operators; REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, + ops::ConvOpInferVarType, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); @@ -363,7 +382,9 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); + REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, + ops::ConvOpInferVarType, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 66f19fe7ecfa51b2ce917f0c5fcb6d486f1a7307..a904dd91302c951560dc32ac107d4d73b6024c25 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/cross_entropy_op.h" +#include namespace paddle { namespace operators { @@ -179,6 +180,15 @@ or not. But the output only shares the LoD information with input X. )DOC"); } }; + +class CrossEntropyOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Y"}}; + } +}; } // namespace operators } // namespace paddle @@ -186,6 +196,7 @@ namespace ops = paddle::operators; using CPUCtx = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, + ops::CrossEntropyOpInferVarType, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc index 862167f02084cfe81db1c0936bbfb0415fa85721..47a06dd0f378f6cc4f79aee52052717188d72420 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() { void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; + VLOG(30) << "AsyncGRPCServer is wait server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; + VLOG(30) << "AsyncGRPCServer WaitSeverReady"; } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index be5c20ad2e4b53e0ff98561b92543b03298381d9..c28f86146d3040c6a26cabfb795eff67375d4b76 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ 
b/paddle/fluid/operators/distributed/grpc_client.cc @@ -38,7 +38,7 @@ void GRPCClient::SendComplete() { std::unique_lock lk(completed_mutex_); if (!completed_) { for (auto& it : channels_) { - VLOG(3) << "send complete message to " << it.first; + VLOG(30) << "send complete message to " << it.first; this->AsyncSendComplete(it.first); } PADDLE_ENFORCE(this->Wait(), "internal grpc error"); @@ -81,7 +81,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, ::grpc::ByteBuffer req; SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; // stub context s->response_call_back_ = nullptr; @@ -142,7 +142,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, ::grpc::ByteBuffer buf; RequestToByteBuffer(req, &buf); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; // stub context s->response_call_back_ = ProcGetResponse; @@ -190,7 +190,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, ::grpc::ByteBuffer req; SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; // stub context s->response_call_back_ = ProcGetResponse; @@ -328,14 +328,14 @@ void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; - VLOG(3) << "GRPCClient Proceed begin"; + VLOG(30) << "GRPCClient Proceed begin"; while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); if (c->status_.ok()) { - VLOG(3) << c->GetVarHandlePtr()->String() << " process"; + VLOG(30) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { // FIXME(gongwb): parse error_details? 
@@ -370,7 +370,7 @@ void GRPCClient::Proceed() { sync_cond_.notify_all(); } } - VLOG(3) << "GRPCClient Proceed end"; + VLOG(30) << "GRPCClient Proceed end"; } std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index eb9e36029c01fb38ca4438578190dd7895182ea1..ffd2b1707bea6c9379dc09c629fa4c920dac8ed0 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -98,7 +98,7 @@ class RequestSend final : public RequestBase { void Process() override { std::string varname = GetReqName(); - VLOG(4) << "RequestSend var_name:" << varname; + VLOG(40) << "RequestSend var_name:" << varname; auto scope = request_->GetMutableLocalScope(); auto invar = request_->GetVar(); @@ -135,7 +135,7 @@ class RequestGet final : public RequestBase { // proc request. std::string varname = request_.varname(); int trainer_id = request_.trainer_id(); - VLOG(4) << "RequestGet " << varname; + VLOG(40) << "RequestGet " << varname; auto scope = request_handler_->scope(); auto invar = scope->FindVar(varname); @@ -182,8 +182,8 @@ class RequestPrefetch final : public RequestBase { std::string in_var_name = request_->Varname(); std::string out_var_name = request_->OutVarname(); int trainer_id = request_->GetTrainerId(); - VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name; + VLOG(40) << "RequestPrefetch, in_var_name: " << in_var_name + << " out_var_name: " << out_var_name; auto scope = request_->GetMutableLocalScope(); auto invar = scope->FindVar(in_var_name); @@ -231,8 +231,8 @@ class RequestCheckpointNotify final : public RequestBase { std::string checkpoint_dir = request_->OutVarname(); int trainer_id = request_->GetTrainerId(); - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; + VLOG(40) << "RequestCheckpointNotify 
notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir; request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, trainer_id, checkpoint_dir); @@ -246,10 +246,10 @@ class RequestCheckpointNotify final : public RequestBase { }; void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is wait server ready"; + VLOG(40) << "AsyncGRPCServer is wait server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(4) << "AsyncGRPCServer WaitSeverReady"; + VLOG(40) << "AsyncGRPCServer WaitSeverReady"; } void AsyncGRPCServer::StartServer() { @@ -282,14 +282,15 @@ void AsyncGRPCServer::StartServer() { reqs.reserve(kRequestBufSize); for (int i = 0; i < kRequestBufSize; i++) { - VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i; + VLOG(60) << "TryToRegisterNewOne on RPC NAME: " << rpc_name + << " I: " << i; TryToRegisterNewOne(rpc_name, i); } for (int i = 0; i < threadnum; i++) { rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(4) << t.first << " creates threads!"; + VLOG(40) << t.first << " creates threads!"; } } @@ -306,7 +307,7 @@ void AsyncGRPCServer::StartServer() { auto& threads = t.second; for (size_t i = 0; i < threads.size(); ++i) { threads[i]->join(); - VLOG(4) << t.first << " threads ends!"; + VLOG(40) << t.first << " threads ends!"; } } } @@ -314,7 +315,7 @@ void AsyncGRPCServer::StartServer() { void AsyncGRPCServer::ShutdownQueue() { for (auto& t : rpc_cq_) { t.second->Shutdown(); - VLOG(4) << t.first << " queue shutdown!"; + VLOG(40) << t.first << " queue shutdown!"; } } @@ -323,7 +324,7 @@ void AsyncGRPCServer::ShutDownImpl() { is_shut_down_ = true; ShutdownQueue(); - VLOG(4) << "server_ shutdown!"; + VLOG(40) << "server_ shutdown!"; server_->Shutdown(); } @@ -331,12 +332,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, int 
req_id) { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { - VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; + VLOG(40) << "shutdown, do not TryToRegisterNewSendOne"; return; } - VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; + VLOG(40) << "TryToRegisterNewOne on RPC NAME: " << rpc_name + << " REQ ID: " << req_id; auto& reqs = rpc_reqs_[rpc_name]; auto& handler = rpc_call_map_[rpc_name]; @@ -357,7 +358,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, reqs[req_id] = b; - VLOG(4) << "Create RequestSend status:" << b->Status(); + VLOG(40) << "Create RequestSend status:" << b->Status(); } void AsyncGRPCServer::HandleRequest( @@ -367,15 +368,15 @@ void AsyncGRPCServer::HandleRequest( bool ok = false; while (true) { - VLOG(4) << "HandleRequest " << rpc_name << " wait next"; + VLOG(40) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; + VLOG(30) << "CompletionQueue " << rpc_name << " shutdown!"; break; } int req_id = static_cast(reinterpret_cast(tag)); - VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; + VLOG(40) << "HandleRequest " << rpc_name << ", req_id:" << req_id + << " get next"; auto& reqs = rpc_reqs_[rpc_name]; RequestBase* base = nullptr; @@ -385,7 +386,7 @@ void AsyncGRPCServer::HandleRequest( base = reqs[req_id]; } - VLOG(3) << base->Status2String(rpc_name); + VLOG(30) << base->Status2String(rpc_name); // reference: // https://github.com/tensorflow/tensorflow/issues/5596 diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3c1db147098055e9974c9dc607266cdaf2e43dae..3bcc59a47ba5f52da1374f220828a0f392e13d27 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -75,7 +75,7 @@ class VarHandle { wait_cond_.wait(lk, [this] 
{ return status_ != kDefaultState; }); ret = status_; } - VLOG(7) << "VarHandle wait:" << ret; + VLOG(70) << "VarHandle wait:" << ret; return ret != kErrorState; } @@ -84,7 +84,7 @@ class VarHandle { std::unique_lock lk(sync_mutex_); status_ = ok ? kFinishState : kErrorState; } - VLOG(7) << "VarHandle finish:" << ok; + VLOG(70) << "VarHandle finish:" << ok; wait_cond_.notify_all(); } diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 025528fe70b8f4d353ab92f29b1bd71c77cf7850..dae56cc8436c2241bfc8ae37ba3cad4069a054bf 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -38,19 +38,19 @@ bool RequestSendHandler::Handle(const std::string& varname, framework::Variable** outvar, const int trainer_id, const std::string& out_var_name) { - VLOG(4) << "RequestSendHandler:" << varname; + VLOG(40) << "RequestSendHandler:" << varname; // Sync if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; + VLOG(30) << "sync: recv BATCH_BARRIER_MESSAGE"; rpc_server_->IncreaseBatchBarrier(kRequestSend); } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; + VLOG(30) << "sync: recv complete message"; rpc_server_->Complete(); } else { // Async if (!sync_mode_) { - VLOG(3) << "async process var: " << varname; + VLOG(30) << "async process var: " << varname; try { executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); @@ -61,7 +61,7 @@ bool RequestSendHandler::Handle(const std::string& varname, return true; } else { // sync rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; + VLOG(30) << "sync: processing received var: " << varname; if (invar == nullptr) { LOG(FATAL) << "sync: Can not find server side var: " << varname; @@ -78,10 +78,10 @@ bool RequestGetHandler::Handle(const std::string& 
varname, framework::Variable** outvar, const int trainer_id, const std::string& out_var_name) { - VLOG(4) << "RequestGetHandler:" << varname; + VLOG(40) << "RequestGetHandler:" << varname; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv fetch barrier message"; + VLOG(30) << "sync: recv fetch barrier message"; rpc_server_->IncreaseBatchBarrier(kRequestGet); } else { rpc_server_->WaitCond(kRequestGet); @@ -93,13 +93,14 @@ bool RequestGetHandler::Handle(const std::string& varname, // NOTE: the format is determined by distributed_transpiler.py std::string param_bak_name = string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; + VLOG(30) << "getting " << param_bak_name << " trainer_id " + << trainer_id; auto var = scope_->FindVar(varname); auto t_orig = var->Get(); auto param_bak = scope_->Var(param_bak_name); auto t = param_bak->GetMutable(); t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; + VLOG(30) << "copying " << varname << " to " << param_bak_name; framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); } *outvar = scope_->FindVar(varname); @@ -114,7 +115,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Variable** outvar, const int trainer_id, const std::string& out_var_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; + VLOG(40) << "RequestPrefetchHandler " << varname; auto var_desc = program_->Block(0).FindVar(out_var_name); InitializeVariable(*outvar, var_desc->GetType()); @@ -138,8 +139,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname, auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); lt_var->clear(); lt_var->append(out_var_name); - VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " - << out_var_name; + VLOG(40) << "RequestCheckpointHandler update var kLookupTablePath to: " + << 
out_var_name; executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_); return true; } diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 3e30ed4ac86bd2cb3f7c4301163e54a947c3d5b4..4055091104f2f96070d0c4e806c6908da691d732 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -39,7 +39,7 @@ void RPCServer::SavePort() const { port_file.open(file_path); port_file << selected_port_; port_file.close(); - VLOG(4) << "selected port written to " << file_path; + VLOG(40) << "selected port written to " << file_path; } void RPCServer::WaitBarrier(const std::string& rpc_name) { @@ -49,12 +49,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) { exit_flag_.load()); }); - VLOG(3) << "batch_barrier_: " << rpc_name << " " - << barrier_counter_[rpc_name]; + VLOG(30) << "batch_barrier_: " << rpc_name << " " + << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(40) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; @@ -71,7 +71,7 @@ void RPCServer::Complete() { client_num_--; need_reset_all_vars_ = true; - VLOG(4) << "decrease client_num to: " << client_num_; + VLOG(40) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; } @@ -90,7 +90,7 @@ int RPCServer::GetClientNum() { } void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; + VLOG(30) << "RPCServer ResetBarrierCounter "; std::unique_lock lock(mutex_); for (auto& t : barrier_counter_) { t.second = 0; @@ -105,12 +105,12 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, static int cond = -1; rpc_cond_map_[rpc_name] = ++cond; - VLOG(4) << "RegisterRPC 
rpc_name:" << rpc_name << ", handler:" << handler - << ", cond:" << rpc_cond_map_[rpc_name]; + VLOG(40) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler + << ", cond:" << rpc_cond_map_[rpc_name]; } void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; + VLOG(30) << "RPCServer SetCond " << rpc_name; { std::unique_lock lock(mutex_); cur_cond_ = rpc_cond_map_[rpc_name]; @@ -120,7 +120,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(4) << "RPCServer WaitCond " << rpc_name; + VLOG(40) << "RPCServer WaitCond " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index b2f73b67dc9bf944892187abd2e5709e54449d7d..d1572ce01aa17273988955c27bdea5b2f40c27ea 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -50,7 +50,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input, size_to_write = length - total_written; } // This log is useful to see how long a internal block size is of rpc. - VLOG(7) << "copy " << size_to_write << " data to CUDAPlace"; + VLOG(70) << "copy " << size_to_write << " data to CUDAPlace"; memory::Copy(boost::get(place), reinterpret_cast(p), cpu, data, size_to_write, gpu_dev_ctx.stream()); @@ -79,7 +79,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input, // TODO(gongwb): can we avoid copy? platform::CPUPlace cpu; // This log is useful to see how long a internal block size is of rpc. 
- VLOG(7) << "copy " << size_to_write << " data to CPUPlace"; + VLOG(70) << "copy " << size_to_write << " data to CPUPlace"; memory::Copy(cpu, reinterpret_cast(p), cpu, data, size_to_write); p += size_to_write; @@ -198,8 +198,8 @@ bool VariableResponse::ProcSerializedField( #endif } - VLOG(7) << "ProcSerializedField:" << meta_.varname() - << ", type:" << meta_.type() << std::endl; + VLOG(70) << "ProcSerializedField:" << meta_.varname() + << ", type:" << meta_.type() << std::endl; framework::DDim dims = GetDims(meta_.dims()); if (meta_.type() == sendrecv::LOD_TENSOR) { PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!"); diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index 5eb4233344e1c49e69dd9830178fd6fb2ae7e51c..f01f67692e1e5dd040971cb0dd1dd793648da97a 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -75,16 +75,12 @@ class ElementwiseOp : public framework::OperatorWithKernel { } }; -class ElementwiseOpInferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto x_name = op_desc.Input("X")[0]; - auto out_name = op_desc.Output("Out")[0]; - auto &x = block->FindRecursiveOrCreateVar(x_name); - auto &out = block->FindRecursiveOrCreateVar(out_name); - out.SetType(x.GetType()); - out.SetDataType(x.GetDataType()); +class ElementwiseOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; } }; diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc index dc7ef664958238ddbd48745bd59cc7db28e49f5b..5da0a536d96e5184d51638bc6b374d2263b5e9eb 100644 --- a/paddle/fluid/operators/feed_op.cc +++ b/paddle/fluid/operators/feed_op.cc @@ -47,8 +47,8 @@ class FeedOp : public 
framework::OperatorBase { auto col = Attr("col"); - VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var " - << out_name; + VLOG(30) << "Feed Var " << feed_var_name << "'s " << col + << " column to var " << out_name; auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 8754856e140ed074782e6fccb8991571a12babab..88a5e59ce7d6c0d14e480922bd328d632c9178e5 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -43,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase { PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (auto& ep : eps) { - VLOG(3) << "fetch barrier, ep: " << ep; + VLOG(30) << "fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc index c197b45e8196a47def6465128e8ca39d8daefed6..c9e759ebff63948046e67def7fb94e0241029581 100644 --- a/paddle/fluid/operators/fetch_op.cc +++ b/paddle/fluid/operators/fetch_op.cc @@ -57,7 +57,7 @@ class FetchOp : public framework::OperatorBase { TensorCopySync(src_item, platform::CPUPlace(), &dst_item); dst_item.set_lod(src_item.lod()); - VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; + VLOG(30) << "Fetch variable " << fetch_var_name << " to " << out_name; } }; diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index d74d4db92528d69492ab7b90a98d3775dadc35d1..e4df59c5d51c390cf593add0c5562665c91f33f6 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -50,7 +50,9 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check 
index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); + int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index d72e07d76c97e9e455e54980207d7c02842cc04b..dc08ee5efacde5e232d751b13aaf11f51237634a 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -38,7 +38,8 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int64_t index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index f84ff206fffddef1030b7ed439e887bdfef342a6..95aa9b573c795159079bdb5401b34d7a61252115 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -31,7 +31,8 @@ class GatherOp : public framework::OperatorWithKernel { "Output(Out) of GatherOp should not be null."); auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE(index_dims.size() == 1); + PADDLE_ENFORCE(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); int batch_size = ctx->GetInputDim("Index")[0]; framework::DDim output_dims(ctx->GetInputDim("X")); output_dims[0] = batch_size; @@ -53,6 +54,7 @@ class GatherGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); } protected: @@ -75,7 +77,7 @@ Gather Operator. 
$Out = X[Index]$ -Out is obtained by gathering entries of the outer-most dimension +Out is obtained by gathering entries of the outer-most dimension of X indexed by Index and concatenate them together. Example: diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index ef574ccdf48dcf6074a777bcb7667b114415674c..56ea165ff84291babc0e9ee56ada669cbbbe79fe 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -64,7 +64,7 @@ class GenNCCLIdOp : public framework::OperatorBase { distributed::RPCClient::GetInstance(0); for (auto& ep : endpoint_list) { - VLOG(3) << "sending nccl id to " << ep; + VLOG(30) << "sending nccl id to " << ep; client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); } client->Wait(); @@ -72,7 +72,7 @@ class GenNCCLIdOp : public framework::OperatorBase { client->AsyncSendBatchBarrier(ep); } client->Wait(); - VLOG(3) << "sending completed..."; + VLOG(30) << "sending completed..."; } void GetIdByServer(framework::Scope* scope, @@ -99,11 +99,11 @@ class GenNCCLIdOp : public framework::OperatorBase { std::bind(&distributed::RPCServer::StartServer, rpc_service.get())); rpc_service->SetCond(distributed::kRequestSend); - VLOG(3) << "start getting nccl id from trainer 0..."; + VLOG(30) << "start getting nccl id from trainer 0..."; rpc_service->WaitBarrier(distributed::kRequestSend); - VLOG(3) << "got nccl id and stop server..."; + VLOG(30) << "got nccl id and stop server..."; rpc_service->ShutDown(); - VLOG(3) << "rpc server stopped"; + VLOG(30) << "rpc server stopped"; server_thread.join(); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 0d5874fc0cc4b90bec141690b88f28a27443bd60..4e91a3dcd272c8d368cb8c43e7e1fb4c98265db4 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -63,12 +63,19 @@ static void CalcGridLocations(const platform::CPUDeviceContext& 
ctx, Tensor ones; ones.mutable_data({n, h, w}, ctx.GetPlace()); auto ones_t = EigenTensor::From(ones).setConstant(1.0); + Tensor half_xmax, half_ymax; + half_xmax.mutable_data({n, h, w}, ctx.GetPlace()); + auto half_xmax_t = + EigenTensor::From(half_xmax).setConstant(0.5 * x_max); + half_ymax.mutable_data({n, h, w}, ctx.GetPlace()); + auto half_ymax_t = + EigenTensor::From(half_ymax).setConstant(0.5 * y_max); // scale grid to [0, h-1/w-1] auto grid_x_t = EigenTensor::From(grid_x); auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t.device(place) = 0.5 * ((grid_x_t + ones_t) * x_max); - grid_y_t.device(place) = 0.5 * ((grid_y_t + ones_t) * y_max); + grid_x_t.device(place) = (grid_x_t + ones_t) * half_xmax_t; + grid_y_t.device(place) = (grid_y_t + ones_t) * half_ymax_t; // calculate coords of 4 corner points x_w->mutable_data({n, h, w}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/interpolate_op.cc similarity index 52% rename from paddle/fluid/operators/bilinear_interp_op.cc rename to paddle/fluid/operators/interpolate_op.cc index 2dc3399da183fbcf7664066f6f7ce12db3dc6d5e..8f979e05d31e5a85bc86784943f4588ab650f668 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -9,7 +9,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/bilinear_interp_op.h" +#include "paddle/fluid/operators/interpolate_op.h" +#include #include #include "paddle/fluid/framework/op_registry.h" @@ -18,27 +19,34 @@ namespace operators { using framework::Tensor; -class BilinearInterpOp : public framework::OperatorWithKernel { +class InterpolateOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of BilinearInterOp should not be null."); + "Input(X) of InterpolateOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of BilinearInterOp should not be null."); + "Output(Out) of InterpolationOp should not be null."); + + auto interp_method = ctx->Attrs().Get("interp_method"); + PADDLE_ENFORCE( + "bilinear" == interp_method || "nearest" == interp_method, + "Interpolation method can only be \"bilinear\" or \"nearest\"."); auto dim_x = ctx->GetInputDim("X"); // NCHW format int out_h = ctx->Attrs().Get("out_h"); int out_w = ctx->Attrs().Get("out_w"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4"); - if (ctx->HasInput("OutSize")) { + if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { auto out_size_dim = ctx->GetInputDim("OutSize"); PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, "OutSize's dimension size must be 1"); PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2"); + ctx->ShareLoD("X", "Out"); + return; } std::vector dim_out({dim_x[0], dim_x[1], out_h, out_w}); ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); @@ -52,35 +60,53 @@ class BilinearInterpOp : public framework::OperatorWithKernel { } }; -class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { +class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor of bilinear interpolation, " - "This is a 4-D 
tensor with shape of (N x C x h x w)"); + "The input tensor of interpolate operator, " + "This is a 4-D tensor with shape of [N, C, H, w]."); AddInput("OutSize", - "This is a 1-D tensor with two number. " + "This is a 1-D tensor with two numbers to specify output size. " "The first number is height and the second number is width.") .AsDispensable(); - AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)"); + AddOutput("Out", + "The output tensor of interpolate operator, " + "This is a 4-D tensor with shape of [N, C, H, W]."); - AddAttr("out_h", "output height of bilinear interpolation op."); - AddAttr("out_w", "output width of bilinear interpolation op."); + AddAttr("out_h", "output height of interpolate op."); + AddAttr("out_w", "output width of interpolate op."); + AddAttr( + "interp_method", + "(string), interpolation method, can be \"bilinear\" for " + "bilinear interpolation and \"nearest\" for nearest " + "neighbor interpolation."); AddComment(R"DOC( + This operator samples input X to given output shape by using specified + interpolation method, the interpolation methods can be \"nearest\" + for nearest neighbor interpolation and \"bilinear\" for bilinear + interpolation. + + Nearest neighbor interpolation is to perform nearest neighbor interpolation + in both the 3rd dimention(in height direction) and the 4th dimention(in width + direction) on input tensor. + Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and - W-direction in this op) on a rectilinear 2D grid. - - The key idea is to perform linear interpolation first in one - direction, and then again in the other direction. - - For details, please refer to Wikipedia: + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then + again in the other direction. 
+ + For details of nearest neighbor interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation + + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation )DOC"); } }; -class BilinearInterpOpGrad : public framework::OperatorWithKernel { +class InterpolateOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -106,11 +132,11 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp, - ops::BilinearInterpOpMaker, +REGISTER_OPERATOR(interpolate, ops::InterpolateOp, ops::InterpolateOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad); -REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel, - ops::BilinearInterpKernel); -REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, - ops::BilinearInterpGradKernel); +REGISTER_OPERATOR(interpolate_grad, ops::InterpolateOpGrad); +REGISTER_OP_CPU_KERNEL(interpolate, ops::InterpolateKernel, + ops::InterpolateKernel, + ops::InterpolateKernel); +REGISTER_OP_CPU_KERNEL(interpolate_grad, ops::InterpolateGradKernel, + ops::InterpolateGradKernel); diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..190afbdac431f863c32e2a4a4b3ad83848e550fc --- /dev/null +++ b/paddle/fluid/operators/interpolate_op.cu @@ -0,0 +1,292 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "paddle/fluid/operators/interpolate_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +__global__ void KeNearestNeighborInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + int channel_id = out_id_w / out_img_size; + + int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int in_img_idy = static_cast(ratio_h * out_img_idy + 0.5); + + int out_img_idx = tid % out_img_w; + int in_img_idx = static_cast(ratio_w * out_img_idx + 0.5); + + out[tid] = in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } +} + +template +__global__ void KeNearestNeighborInterpBw( + T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, + const size_t input_w, const T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w) { + int nthreads = 
output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + int channel_id = out_id_w / out_img_size; + + int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int in_img_idy = static_cast(ratio_h * out_img_idy + 0.5); + + int out_img_idx = tid % out_img_w; + int in_img_idx = static_cast(ratio_w * out_img_idx + 0.5); + + T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + const T out_pos = out[out_id_h * output_w + out_id_w]; + platform::CudaAtomicAdd(in_pos, out_pos); + } +} + +template +__global__ void KeBilinearInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + int channel_id = out_id_w / out_img_size; + + int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int in_img_idy = ratio_h * out_img_idy; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int out_img_idx = tid % out_img_w; + int in_img_idx = ratio_w * out_img_idx; + int w_id = (in_img_idx < in_img_w - 1) ? 
1 : 0; + T w1lambda = ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + w1lambda * in_pos[h_id * in_img_w + w_id]); + } +} + +template +__global__ void KeBilinearInterpBw( + T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, + const size_t input_w, const T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const T ratio_h, const T ratio_w) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + int channel_id = out_id_w / out_img_size; + + int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int in_img_idy = ratio_h * out_img_idy; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T h1lambda = ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int out_img_idx = tid % out_img_w; + int in_img_idx = ratio_w * out_img_idx; + int w_id = (in_img_idx < in_img_w - 1) ? 
1 : 0; + T w1lambda = ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[h_id * in_img_w], + h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], + h1lambda * w1lambda * out_pos[0]); + } +} + +template +class InterpolateOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* input_data = input->data(); + + auto interp_method = ctx.Attr("interp_method"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + + int n = input->dims()[0]; + int c = input->dims()[1]; + int in_h = input->dims()[2]; + int in_w = input->dims()[3]; + + auto* output_data = + output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = c * in_hw; + int out_chw = c * out_hw; + + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + + int pixelNum = n * out_chw; + int grid_dim = (pixelNum + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 8 : grid_dim; + + if ("nearest" == interp_method) { + KeNearestNeighborInterpFw< + T><<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w); + } else if ("bilinear" == interp_method) { + KeBilinearInterpFw< + T><<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w); + } + } +}; + +template +class InterpolateGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* output_grad_data = output_grad->data(); + auto* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + + auto& device_ctx = + ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + auto interp_method = ctx.Attr("interp_method"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + + int n = input_grad->dims()[0]; + int c = input_grad->dims()[1]; + int in_h = input_grad->dims()[2]; + int in_w = input_grad->dims()[3]; + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = c * in_hw; + int out_chw = c * out_hw; + + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); + return; + } + + int pixelNum = n * out_chw; + int grid_dim = (pixelNum + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 8 : grid_dim; + + if ("nearest" == interp_method) { + KeNearestNeighborInterpBw< + T><<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w); + } else if ("bilinear" == interp_method) { + KeBilinearInterpBw< + T><<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(interpolate, ops::InterpolateOpCUDAKernel, + ops::InterpolateOpCUDAKernel, + ops::InterpolateOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(interpolate_grad, + ops::InterpolateGradOpCUDAKernel, + ops::InterpolateGradOpCUDAKernel); diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7fdb3e1f5a2ff82284d89dd0759e357978e1d873 --- /dev/null +++ b/paddle/fluid/operators/interpolate_op.h @@ -0,0 +1,236 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +using EigenTensor = framework::EigenTensor; +using Tensor = framework::Tensor; + +template +static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, + const float ratio_h, const float ratio_w, + const int n, const int c, + const int out_h, const int out_w) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = static_cast(ratio_h * k + 0.5); + + for (int l = 0; l < out_w; l++) { + int in_l = static_cast(ratio_w * l + 0.5); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + output_t(i, j, k, l) = input_t(i, j, in_k, in_l); + } + } + } + } +} + +template +static void BilinearInterpolation(const Tensor& input, Tensor* output, + const float ratio_h, const float ratio_w, + const int in_h, const int in_w, const int n, + const int c, const int out_h, + const int out_w) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + for (int k = 0; k < out_h; k++) { // loop for images + int y_n = static_cast(ratio_h * k); + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float d_n = ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { + int x_w = static_cast(ratio_w * l); + int x_e = (x_w + 1) < (in_w - 1) ? 
(x_w + 1) : (in_w - 1); + float d_w = ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // bilinear interpolation + output_t(i, j, k, l) = input_t(i, j, y_n, x_w) * d_s * d_e + + input_t(i, j, y_s, x_w) * d_n * d_e + + input_t(i, j, y_n, x_e) * d_s * d_w + + input_t(i, j, y_s, x_e) * d_n * d_w; + } + } + } + } +} + +template +static void NearestNeighborInterpolateGrad(const Tensor& output_grad, + Tensor* input_grad, + const float ratio_h, + const float ratio_w, const int n, + const int c, const int out_h, + const int out_w) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = static_cast(ratio_h * k + 0.5); + + for (int l = 0; l < out_w; l++) { + int in_l = static_cast(ratio_w * l + 0.5); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); + } + } + } + } +} + +template +static void BilinearInterpolationGrad(const Tensor& output_grad, + Tensor* input_grad, const float ratio_h, + const float ratio_w, const int in_h, + const int in_w, const int n, const int c, + const int out_h, const int out_w) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + for (int k = 0; k < out_h; k++) { // loop for images + int y_n = static_cast(ratio_h * k); + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float d_n = ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { + int x_w = static_cast(ratio_w * l); + int x_e = (x_w + 1) < (in_w - 1) ? 
(x_w + 1) : (in_w - 1); + float d_w = ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // bilinear interpolation grad + const T grad = output_grad_t(i, j, k, l); + input_grad_t(i, j, y_n, x_w) += static_cast(grad * d_s * d_e); + input_grad_t(i, j, y_s, x_w) += static_cast(grad * d_n * d_e); + input_grad_t(i, j, y_n, x_e) += static_cast(grad * d_s * d_w); + input_grad_t(i, j, y_s, x_e) += static_cast(grad * d_n * d_w); + } + } + } + } +} + +template +class InterpolateKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + std::string interp_method = ctx.Attr("interp_method"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = out_size->data(); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; + + output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); + auto& device_ctx = + ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, output, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; + + if ("bilinear" == interp_method) { + BilinearInterpolation(*input, output, ratio_h, ratio_w, in_h, in_w, n, + c, out_h, out_w); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolate(*input, output, ratio_h, ratio_w, n, c, + out_h, out_w); + } + } +}; + +template +class InterpolateGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + + std::string interp_method = ctx.Attr("interp_method"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = out_size->data(); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int in_h = input->dims()[2]; + const int in_w = input->dims()[3]; + + input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); + auto& device_ctx = + ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; + + if ("bilinear" == interp_method) { + BilinearInterpolationGrad(*output_grad, input_grad, ratio_h, ratio_w, + in_h, in_w, n, c, out_h, out_w); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolateGrad(*output_grad, input_grad, ratio_h, + ratio_w, n, c, out_h, out_w); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 1d8b1411cddf4fe16d2d00313c519cc173e1504d..e3d09e2d14817fe0f2ccda18ed90c9436b399ae3 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -36,7 +36,7 @@ namespace operators { void RunServer(std::shared_ptr service) { service->StartServer(); - VLOG(4) << "RunServer thread end"; + VLOG(40) << "RunServer thread end"; } static void split(const std::string &str, char sep, std::vector *pieces) { @@ -66,8 +66,8 @@ static void ParallelExecuteBlocks( fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() { int run_block = idx; // thread local try { - VLOG(3) << "running server block: " << run_block - << "pointer: " << prepared[run_block].get(); + VLOG(30) << "running server block: " << run_block + << "pointer: " << prepared[run_block].get(); executor->RunPreparedContext(prepared[run_block].get(), scope); } catch (const std::exception &e) { LOG(FATAL) << "run sub program:" << idx << " error " << e.what(); @@ -108,7 +108,7 @@ void ListenAndServOp::RunSyncLoop( framework::Scope *recv_scope, platform::DeviceContext *dev_ctx, const std::vector &prefetch_block_id_list, const int checkpoint_point_block_id) const { - VLOG(2) << "RunSyncLoop"; + VLOG(20) << "RunSyncLoop"; size_t num_blocks = program->Size(); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -167,7 +167,7 @@ void ListenAndServOp::RunSyncLoop( } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run 
all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(20) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); @@ -183,11 +183,11 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, for (auto &varname : sparse_vars_) { auto var = recv_scope->FindVar(varname); if (var == nullptr) { - VLOG(2) << "can not find var " << varname << " in received scope"; + VLOG(20) << "can not find var " << varname << " in received scope"; continue; } if (var->IsType()) { - VLOG(3) << "reset sparse var: " << varname; + VLOG(30) << "reset sparse var: " << varname; var->GetMutable()->mutable_rows()->clear(); } else { PADDLE_THROW("The type of sparse var should be SelectedRows"); @@ -197,7 +197,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, for (auto &varname : dense_vars_) { auto var = recv_scope->FindVar(varname); if (var == nullptr) { - VLOG(2) << "can not find var " << varname << " in received scope"; + VLOG(20) << "can not find var " << varname << " in received scope"; continue; } if (var->IsType()) { @@ -216,7 +216,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope) const { - VLOG(2) << "RunAsyncLoop"; + VLOG(20) << "RunAsyncLoop"; auto grad_to_block_id_str = Attr>("grad_to_block_id"); DoubleFindMap grad_to_block_id; @@ -225,7 +225,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, const std::string &grad_and_id) { std::vector pieces; split(grad_and_id, ':', &pieces); - VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1]; + VLOG(30) << "after split, key = " << pieces[0] << ", id=" << pieces[1]; PADDLE_ENFORCE_EQ(pieces.size(), 2); PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0); @@ -270,7 +270,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, 
while (true) { if (rpc_service_->IsExit()) { - VLOG(4) << "get exit!rpc_processor break!"; + VLOG(40) << "get exit!rpc_processor break!"; break; } @@ -332,9 +332,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, std::string endpoint = Attr("endpoint"); int checkpoint_block_id = Attr(kCheckpointBlockId); - VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in - << ", end_point:" << endpoint - << ", checkpoint_block_id: " << checkpoint_block_id; + VLOG(40) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in + << ", end_point:" << endpoint + << ", checkpoint_block_id: " << checkpoint_block_id; rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); @@ -383,8 +383,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, prefetch_var_name_to_block_id_str) { std::vector pieces; split(prefetch_var_name_and_id, ':', &pieces); - VLOG(3) << "after split, prefetch_var = " << pieces[0] - << ", id=" << pieces[1]; + VLOG(30) << "after split, prefetch_var = " << pieces[0] + << ", id=" << pieces[1]; PADDLE_ENFORCE_EQ(pieces.size(), 2); int block_id = std::stoi(pieces[1]); @@ -415,7 +415,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // start the server listening after all member initialized. 
server_thread_.reset(new std::thread(RunServer, rpc_service_)); - VLOG(3) << "wait server thread to become ready..."; + VLOG(30) << "wait server thread to become ready..."; rpc_service_->WaitServerReady(); // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc index 166952fe23192799443ef9c9d1f7ba5056d19290..59ef9cb626d61f918c8ad1990a0f25030fb44ec6 100644 --- a/paddle/fluid/operators/lod_rank_table_op.cc +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -30,9 +30,9 @@ class LoDRankTableOp : public framework::OperatorBase { auto x = scope.FindVar(Input("X"))->Get(); auto *out = scope.FindVar(Output("Out"))->GetMutable(); - VLOG(10) << "Level = " << static_cast(Attr("level")); + VLOG(100) << "Level = " << static_cast(Attr("level")); out->Reset(x.lod(), static_cast(Attr("level"))); - VLOG(10) << Input("X") << "'s lod information is " << *out; + VLOG(100) << Input("X") << "'s lod information is " << *out; } }; diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 3226a727b1f5f6de9e97ce2068381be7c9b69ff3..1878dfe8a897db1b8c948d325fa48a38ca224a2b 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -134,13 +134,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") - << " is set to SelectedRows"; + VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; block->Var(out_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") - << " is set to LoDTensor"; + VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to 
LoDTensor"; block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index cd40f1b2f984126663a5711efac24fdf6d680b32..18a586f8dd9f01357d9facca19c51ed5c293ffd2 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -96,8 +96,8 @@ void TestAndBench(const int n, std::function tgt, } auto et = GetCurrentUS(); - VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat - << " us, tgt takes: " << (mt - st) / repeat; + VLOG(30) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat + << " us, tgt takes: " << (mt - st) / repeat; for (int i = 0; i < n; ++i) { EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3); } diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index a92e5d351e71a55bca2845ce275780950d096031..6b3eecfbd11471b5d95dcb10c91acc536f78cb85 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -24,21 +24,30 @@ namespace gen { using namespace platform::jit; // NOLINT -bool VVVJitCode::init(int d) { +bool VXXJitCode::init(int d, int scalar_index) { // It's not necessary to use avx512 since it would slow down the frequency // and this kernel is not compute bound. 
- return MayIUse(avx); + return MayIUse(avx) && scalar_index >= 0 && scalar_index <= 2; } -void VVVJitCode::generate() { +void VXXJitCode::generate() { // do not need push stack, and do not need save avx512reg if do not use avx512 int offset = 0; if (with_relu_) { vxorps(ymm_zero, ymm_zero, ymm_zero); } + if (scalar_index_ == 1) { + vbroadcastss(ymm_src1, ptr[param1]); + } else if (scalar_index_ == 2) { + vbroadcastss(ymm_src2, ptr[param2]); + } for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { - vmovups(ymm_src1, ptr[param1 + offset]); - vmovups(ymm_src2, ptr[param2 + offset]); + if (scalar_index_ != 1) { + vmovups(ymm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(ymm_src2, ptr[param2 + offset]); + } if (type_ == operand_type::mul) { vmulps(ymm_dst, ymm_src1, ymm_src2); } else if (type_ == operand_type::add) { @@ -52,8 +61,12 @@ void VVVJitCode::generate() { } int rest = num_ % AVX_FLOAT_BLOCK; if (rest >= 4) { - vmovups(xmm_src1, ptr[param1 + offset]); - vmovups(xmm_src2, ptr[param2 + offset]); + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(xmm_src2, ptr[param2 + offset]); + } if (type_ == operand_type::mul) { vmulps(xmm_dst, xmm_src1, xmm_src2); } else if (type_ == operand_type::add) { @@ -67,8 +80,12 @@ void VVVJitCode::generate() { rest -= 4; } if (rest >= 2) { - vmovq(xmm_src1, ptr[param1 + offset]); - vmovq(xmm_src2, ptr[param2 + offset]); + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(xmm_src2, ptr[param2 + offset]); + } if (type_ == operand_type::mul) { vmulps(xmm_dst, xmm_src1, xmm_src2); } else if (type_ == operand_type::add) { @@ -82,8 +99,12 @@ void VVVJitCode::generate() { rest -= 2; } if (rest > 0) { - vmovss(xmm_src1, ptr[param1 + offset]); - vmovss(xmm_src2, ptr[param2 + offset]); + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + 
vmovups(xmm_src2, ptr[param2 + offset]); + } if (type_ == operand_type::mul) { vmulss(xmm_dst, xmm_src1, xmm_src2); } else if (type_ == operand_type::add) { @@ -96,6 +117,7 @@ void VVVJitCode::generate() { } ret(); } + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 73692ebc67c71f6190f2d18bd50071a28a35d4c9..aaedb0ae10323eeddfba9512d9e47c7a22320610 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -29,33 +29,46 @@ using ymm_t = const Xbyak::Ymm; using zmm_t = const Xbyak::Zmm; using Label = Xbyak::Label; -// function: vec = Operand(vec, vec) (maybe with relu) typedef enum { mul = 0, add } operand_type; -class VVVJitCode : public JitCode { +// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) +class VXXJitCode : public JitCode { public: const char* name() const override { - std::string base = "VVVJitCode"; + std::string base = "VXXJitCode"; + if (scalar_index_ == 1) { + base += "_Scalar"; + } else { + base += "_Vec"; + } if (type_ == operand_type::mul) { base += "_Mul"; } else if (type_ == operand_type::add) { base += "_Add"; } - base += (with_relu_ ? "_relu" : ""); + if (scalar_index_ == 2) { + base += "_Scalar"; + } else { + base += "_Vec"; + } + base += (with_relu_ ? 
"_Relu" : ""); return base.c_str(); } - explicit VVVJitCode(int d, operand_type type, bool with_relu, - size_t code_size = 256 * 1024, void* code_ptr = nullptr) + explicit VXXJitCode(int d, operand_type type, int scalar_index, + bool with_relu, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) : JitCode(code_size, code_ptr), num_(d), type_(type), + scalar_index_(scalar_index), with_relu_(with_relu) {} - static bool init(int d); + static bool init(int d, int scalar_index = 0); void generate() override; private: int num_; operand_type type_; + int scalar_index_; bool with_relu_; reg64_t param1{abi_param1}; reg64_t param2{abi_param2}; @@ -63,13 +76,13 @@ class VVVJitCode : public JitCode { xmm_t xmm_src1 = xmm_t(0); xmm_t xmm_src2 = xmm_t(1); - xmm_t xmm_dst = xmm_t(1); - xmm_t xmm_zero = xmm_t(2); + xmm_t xmm_dst = xmm_t(2); + xmm_t xmm_zero = xmm_t(3); ymm_t ymm_src1 = ymm_t(0); ymm_t ymm_src2 = ymm_t(1); - ymm_t ymm_dst = ymm_t(1); - ymm_t ymm_zero = ymm_t(2); + ymm_t ymm_dst = ymm_t(2); + ymm_t ymm_zero = ymm_t(3); }; } // namespace gen diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 04e0b81d3e7c696ac2f5ee78db90fb3c89ab345d..e9b259282cd00cc2afc46634423ec09590bf5dd3 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -83,14 +83,15 @@ class VAddReluKernel : public Kernel { template class VScalKernel : public Kernel { public: - virtual void Compute(const T a, const T *x, T *y) const = 0; - virtual void Compute(const T a, T *x) const = 0; + // y = a.*x + void (*Compute)(const T *, const T *, T *, int); }; template class VAddBiasKernel : public Kernel { public: - virtual void Compute(const T a, const T *x, T *y) const = 0; + // y = a.+x + void (*Compute)(const T *, const T *, T *, int); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 
f976953a245e424e6cb26bbf1cff2f120f84c133..c4bfbcf925a2bbdc39f8468049c58e126d3eba1b 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -57,6 +57,20 @@ void VAddReluRefer(const T* x, const T* y, T* z, int n) { } } +template +void VScalRefer(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] * x[i]; + } +} + +template +void VAddBiasRefer(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] + x[i]; + } +} + #ifdef PADDLE_WITH_MKLML template void VMulMKL(const T* x, const T* y, T* z, int n); @@ -83,6 +97,28 @@ template <> void VAddMKL(const double* x, const double* y, double* z, int n) { platform::dynload::vdAdd(n, x, y, z); } + +template +void VScalMKL(const T* a, const T* x, T* y, int n); + +template <> +void VScalMKL(const float* a, const float* x, float* y, int n) { + if (x == y) { + platform::dynload::cblas_sscal(n, *a, y, 1); + } else { + VScalRefer(a, x, y, n); + } +} + +template <> +void VScalMKL(const double* a, const double* x, double* y, int n) { + if (x == y) { + platform::dynload::cblas_dscal(n, *a, y, 1); + } else { + VScalRefer(a, x, y, n); + } +} + #endif #define DECLARE_STATIC_FUNC \ @@ -102,7 +138,7 @@ class VMulKernelImpl : public VMulKernel { if (useJIT(d)) { // roughly estimate the size of code size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; - jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::mul, false, + jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false, sz > 4096 ? 
sz : 4096)); this->Compute = jitcode_->getCode(); @@ -121,14 +157,14 @@ class VMulKernelImpl : public VMulKernel { #ifdef PADDLE_WITH_XBYAK private: - std::unique_ptr jitcode_{nullptr}; + std::unique_ptr jitcode_{nullptr}; #endif }; #ifdef PADDLE_WITH_XBYAK template <> bool VMulKernelImpl::useJIT(int d) { - return gen::VVVJitCode::init(d); + return gen::VXXJitCode::init(d); } #endif @@ -153,7 +189,7 @@ class VAddKernelImpl : public VAddKernel { #ifdef PADDLE_WITH_XBYAK if (useJIT(d)) { size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; - jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::add, false, + jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false, sz > 4096 ? sz : 4096)); this->Compute = jitcode_->getCode(); @@ -171,14 +207,14 @@ class VAddKernelImpl : public VAddKernel { #ifdef PADDLE_WITH_XBYAK private: - std::unique_ptr jitcode_{nullptr}; + std::unique_ptr jitcode_{nullptr}; #endif }; #ifdef PADDLE_WITH_XBYAK template <> bool VAddKernelImpl::useJIT(int d) { - return gen::VVVJitCode::init(d); + return gen::VXXJitCode::init(d); } #endif @@ -203,7 +239,7 @@ class VAddReluKernelImpl : public VAddReluKernel { #ifdef PADDLE_WITH_XBYAK if (useJIT(d)) { size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; - jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::add, true, + jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true, sz > 4096 ? 
sz : 4096)); this->Compute = jitcode_->getCode(); @@ -215,148 +251,106 @@ class VAddReluKernelImpl : public VAddReluKernel { #ifdef PADDLE_WITH_XBYAK private: - std::unique_ptr jitcode_{nullptr}; + std::unique_ptr jitcode_{nullptr}; #endif }; #ifdef PADDLE_WITH_XBYAK template <> bool VAddReluKernelImpl::useJIT(int d) { - return gen::VVVJitCode::init(d); + return gen::VXXJitCode::init(d); } #endif -#undef DECLARE_STATIC_FUNC - -REGISTER_JITKERNEL(vmul, VMulKernel); -REGISTER_JITKERNEL(vadd, VAddKernel); -REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); - -/* VSCAL JitKernel */ -template +/* VScal JitKernel */ +template class VScalKernelImpl : public VScalKernel { public: - explicit VScalKernelImpl(int d) : VScalKernel() { this->num_ = d; } - void Compute(const T a, const T* x, T* y) const override { - for (int i = 0; i < this->num_; ++i) { - y[i] = a * x[i]; - } - } - void Compute(const T a, T* x) const override { - for (int i = 0; i < this->num_; ++i) { - x[i] = a * x[i]; + DECLARE_STATIC_FUNC; + explicit VScalKernelImpl(int d) : VScalKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; + jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false, + sz > 4096 ? 
sz : 4096)); + this->Compute = + jitcode_->getCode(); + return; } - } -}; - +#endif #ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VScalKernelImpl::Compute(const float a, float* x) \ - const { \ - platform::dynload::cblas_sscal(this->num_, a, x, 1); \ - } - -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VScalKernelImpl::Compute(const double a, double* x) \ - const { \ - platform::dynload::cblas_dscal(this->num_, a, x, 1); \ - } - -FOR_EACH_ISA(MKL_FLOAT, kGT16); -FOR_EACH_ISA_BLOCK(MKL_DOUBLE); + if (useMKL(d)) { + this->Compute = VScalMKL; + return; + } #endif - -#define INTRI8_FLOAT(isa) \ - template <> \ - void VScalKernelImpl::Compute( \ - const float a, const float* x, float* y) const { \ - __m256 tmp; \ - __m256 scalar = _mm256_set1_ps(a); \ - tmp = _mm256_loadu_ps(x); \ - tmp = _mm256_mul_ps(tmp, scalar); \ - _mm256_storeu_ps(y, tmp); \ - } -#define INTRI8_INPLACE_FLOAT(isa) \ - template <> \ - void VScalKernelImpl::Compute(const float a, float* x) \ - const { \ - __m256 tmp; \ - __m256 scalar = _mm256_set1_ps(a); \ - tmp = _mm256_loadu_ps(x); \ - tmp = _mm256_mul_ps(tmp, scalar); \ - _mm256_storeu_ps(x, tmp); \ + this->Compute = VScalRefer; } +#ifdef PADDLE_WITH_XBYAK -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -INTRI8_INPLACE_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -INTRI8_INPLACE_FLOAT(jit::avx2); + private: + std::unique_ptr jitcode_{nullptr}; #endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); -INTRI8_INPLACE_FLOAT(jit::avx512f); +}; + +#ifdef PADDLE_WITH_XBYAK +template <> +bool VScalKernelImpl::useJIT(int d) { + return gen::VXXJitCode::init(d, 1); +} #endif -// TODO(TJ): eq16 test and complete avx512 -#undef INTRI8_FLOAT -#undef INTRI8_INPLACE_FLOAT -#undef MKL_FLOAT -#undef MKL_DOUBLE +#ifdef PADDLE_WITH_MKLML +template <> +bool VScalKernelImpl::useMKL(int d) { + return d > 512; +} +template <> +bool VScalKernelImpl::useMKL(int d) { + return true; +} +#endif /* VAddBias 
JitKernel */ -template +template class VAddBiasKernelImpl : public VAddBiasKernel { public: - explicit VAddBiasKernelImpl(int d) : VAddBiasKernel() { this->num_ = d; } - void Compute(const T a, const T* x, T* y) const override { - for (int i = 0; i < this->num_; ++i) { - y[i] = x[i] + a; + DECLARE_STATIC_FUNC; + explicit VAddBiasKernelImpl(int d) : VAddBiasKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; + jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false, + sz > 4096 ? sz : 4096)); + this->Compute = + jitcode_->getCode(); + return; } - } -}; - -#define INTRI8_FLOAT(isa) \ - template <> \ - void VAddBiasKernelImpl::Compute( \ - const float a, const float* x, float* y) const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \ - _mm256_storeu_ps(y, tmp); \ - } +#endif -#define INTRI16_FLOAT(isa) \ - template <> \ - void VAddBiasKernelImpl::Compute( \ - const float a, const float* x, float* y) const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \ - tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ + this->Compute = VAddBiasRefer; } +#ifdef PADDLE_WITH_XBYAK -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -INTRI16_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -INTRI16_FLOAT(jit::avx2); + private: + std::unique_ptr jitcode_{nullptr}; #endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); -INTRI16_FLOAT(jit::avx512f); +}; + +#ifdef PADDLE_WITH_XBYAK +template <> +bool VAddBiasKernelImpl::useJIT(int d) { + return gen::VXXJitCode::init(d, 1); +} #endif -// TODO(TJ): eq16 test and complete avx512 -#undef INTRI8_FLOAT -#undef INTRI16_FLOAT +#undef DECLARE_STATIC_FUNC + +REGISTER_JITKERNEL(vmul, VMulKernel); +REGISTER_JITKERNEL(vadd, VAddKernel); +REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); 
+REGISTER_JITKERNEL(vscal, VScalKernel); +REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); /* VRelu JitKernel */ template @@ -467,8 +461,6 @@ class VIdentityKernelImpl : public VIdentityKernel { void Compute(const T* x, T* y) const override {} }; -REGISTER_JITKERNEL_DEPRECATED(vscal, VScalKernel); -REGISTER_JITKERNEL_DEPRECATED(vaddb, VAddBiasKernel); REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel); REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel); diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index d7c177e6782e19e199542e10e1d62587ee0df4cf..c55e54a13f539014c0f582436ca1a105d0b0fedd 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -409,10 +409,11 @@ class VTanhKernelImpl : public VTanhKernel { vaddbias_ = KernelPool::Instance().template Get>(d); } void Compute(const T* x, T* y) const override { - vscal_->Compute(static_cast(2), x, y); + const T a = static_cast(2), b = static_cast(-1); + vscal_->Compute(&a, x, y, this->num_); vsigmoid_->Compute(y, y); - vscal_->Compute(static_cast(2), y); - vaddbias_->Compute(static_cast(-1), y, y); + vscal_->Compute(&a, y, y, this->num_); + vaddbias_->Compute(&b, y, y, this->num_); } private: @@ -472,10 +473,11 @@ class VTanhKernelImpl : public VTanhKernel { _mm256_storeu_ps(y, tmp); \ x += AVX_FLOAT_BLOCK; \ y += AVX_FLOAT_BLOCK; \ - vscal_->Compute(2.f, x, y); \ + const float a = 2.f, b = -1.f; \ + vscal_->Compute(&a, x, y, this->num_); \ vsigmoid_->Compute(y, y); \ - vscal_->Compute(2.f, y); \ - vaddbias_->Compute(-1.f, y, y); \ + vscal_->Compute(&a, y, y, this->num_); \ + vaddbias_->Compute(&b, y, y, this->num_); \ } #define INTRI_GT16_FLOAT(isa, expisa) \ @@ -502,10 +504,11 @@ class VTanhKernelImpl : public VTanhKernel { } \ x += this->end_; \ y += this->end_; \ - vscal_->Compute(2.f, x, y); \ + const float a = 2.f, b = -1.f; \ + vscal_->Compute(&a, x, y, this->num_); \ vsigmoid_->Compute(y, y); \ 
- vscal_->Compute(2.f, y); \ - vaddbias_->Compute(-1.f, y, y); \ + vscal_->Compute(&a, y, y, this->num_); \ + vaddbias_->Compute(&b, y, y, this->num_); \ } #ifdef __AVX__ diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 9a19424691fad70c161ca6036c5cdfd3b2b22ada..7dc3e600b564d95b46070ff4436b2d0de2f3e105 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -87,7 +87,7 @@ TEST(JitKernel, vrelu) { vrelu_intri8(d, x_data, zref_data); } auto si1 = GetCurrentUS(); - VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; } #endif auto ttgts = GetCurrentUS(); @@ -95,8 +95,9 @@ TEST(JitKernel, vrelu) { ker->Compute(x_data, ztgt_data); } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(30) << "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -128,12 +129,13 @@ TEST(JitKernel, vaddbias) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(a, x_data, ztgt_data); + ker->Compute(&a, x_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(30) << "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -183,13 +185,14 @@ TEST(JitKernel, vexp) { } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + VLOG(30) << "Vec size " << d + 
<< ": refer takes: " << (trefe - trefs) / repeat #ifdef PADDLE_WITH_MKLML - << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " #else - << " us, " + << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -254,9 +257,10 @@ TEST(JitKernel, vsigmoid) { } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(30) << "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -281,10 +285,11 @@ void vtanh_better( const paddle::operators::math::jitkernel::VAddBiasKernel>& vaddbias, const int n, const float* x, float* y) { - vscal->Compute(2.f, x, y); + const float a = 2.f, b = -1.f; + vscal->Compute(&a, x, y, n); vsigmoid->Compute(y, y); - vscal->Compute(2.f, y); - vaddbias->Compute(-1.f, y, y); + vscal->Compute(&a, y, y, n); + vaddbias->Compute(&b, y, y, n); } TEST(JitKernel, vtanh) { @@ -320,9 +325,10 @@ TEST(JitKernel, vtanh) { } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(30) << "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -440,9 +446,10 @@ TEST(JitKernel, lstm) { 
ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, better(jit) takes: " << (tmkle - tmkls) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(30) << "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; } } @@ -524,30 +531,32 @@ TEST(JitKernel, vscal) { vscal_inp_intri8(d, a, y_data); } auto si3 = GetCurrentUS(); - VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat - << " us, inplace: " << (si3 - si2) / repeat; + VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat + << " us, inplace: " << (si3 - si2) / repeat; } #endif auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(a, x_data, ztgt_data); + ker->Compute(&a, x_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); auto ttgts1 = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(a, y_data); + ker->Compute(&a, y_data, y_data, d); } auto ttgte1 = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, inplace takes: " << (trefe1 - trefs1) / repeat + VLOG(30) << "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat + << " us, inplace takes: " << (trefe1 - trefs1) / repeat #ifdef PADDLE_WITH_MKLML - << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, " + << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat + << " us, " #else - << " us, " + << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat - << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat + << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -610,7 +619,7 @@ TEST(JitKernel, vmul) { 
vmul_intri8(d, x_data, y_data, zref_data); } auto si1 = GetCurrentUS(); - VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; } #endif @@ -620,13 +629,14 @@ TEST(JitKernel, vmul) { } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + VLOG(30) << "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat #ifdef PADDLE_WITH_MKLML - << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " #else - << " us, " + << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -689,7 +699,7 @@ TEST(JitKernel, vadd) { vadd_intri8(d, x_data, y_data, zref_data); } auto si1 = GetCurrentUS(); - VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; } #endif @@ -699,13 +709,14 @@ TEST(JitKernel, vadd) { } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + VLOG(30) << "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat #ifdef PADDLE_WITH_MKLML - << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " #else - << " us, " + << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -760,9 +771,10 @@ TEST(JitKernel, vaddrelu) { ker->Compute(x_data, y_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); - VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat - << " us, better takes: " << (tmkle - tmkls) / repeat << " us, " - << "tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(30) 
<< "Vec size " << d + << ": refer takes: " << (trefe - trefs) / repeat + << " us, better takes: " << (tmkle - tmkls) / repeat << " us, " + << "tgt takes: " << (ttgte - ttgts) / repeat; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 75946740375d74043960b68e94eb048b3bab4b79..9577a4cb9d275df9604b7578f8685e4d2938a5e9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -270,7 +270,7 @@ struct MergeAdd { const std::vector& inputs, framework::SelectedRows* output) { if (inputs.size() == 0) { - VLOG(3) << "no input! return"; + VLOG(30) << "no input! return"; return; } const framework::SelectedRows* has_value_input = nullptr; @@ -281,7 +281,7 @@ struct MergeAdd { } } if (has_value_input == nullptr) { - VLOG(3) << "no input has value! just return" << std::endl; + VLOG(30) << "no input has value! just return" << std::endl; return; } auto input_width = has_value_input->value().dims()[1]; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index c4fccdbf862fda8a599869c30ae598573ca367aa..74b9659cfd38076bf1948b5c664817a6753b7090 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -314,7 +314,7 @@ struct MergeAdd { const std::vector& inputs, framework::SelectedRows* output) { if (inputs.size() == 0) { - VLOG(3) << "no input! return"; + VLOG(30) << "no input! return"; return; } const framework::SelectedRows* has_value_input = nullptr; @@ -325,7 +325,7 @@ struct MergeAdd { } } if (has_value_input == nullptr) { - VLOG(3) << "no input has value! just return" << std::endl; + VLOG(30) << "no input has value! 
just return" << std::endl; return; } auto input_width = has_value_input->value().dims()[1]; diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 19426b3c204095bd415cebcd87cff18468acd564..820636defad0be9fb2e6decefc938658ae70ea9b 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mean_op.h" - +#include namespace paddle { namespace operators { @@ -42,6 +42,14 @@ Mean Operator calculates the mean of all elements in X. } }; +class MeanOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; + } +}; + class MeanGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -50,6 +58,14 @@ class MeanGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->ShareLoD("X", framework::GradVarName("X")); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class MeanGradMaker : public framework::SingleGradOpDescMaker { @@ -71,7 +87,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker); +REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType, + ops::MeanGradMaker); REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); REGISTER_OP_CPU_KERNEL( mean, ops::MeanKernel, diff --git a/paddle/fluid/operators/momentum_op.h 
b/paddle/fluid/operators/momentum_op.h index 71f079e4d97f5259359ee6572f584894551452ca..e5b756b4fa637f2d4136f8c8a87bf34c6c04413a 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -346,7 +346,7 @@ class MomentumOpKernel : public framework::OpKernel { // sparse update maybe empty. if (grad->rows().size() == 0) { - VLOG(3) << "Grad SelectedRows contains no data!"; + VLOG(30) << "Grad SelectedRows contains no data!"; return; } auto* merged_grad = const_cast(ctx.scope()) diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 363abfb0e0c96e8a4d82124dff168f28e339a9ae..08f2949d4a3774894912ae5251806b46e6240702 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -38,9 +38,9 @@ class MulOp : public framework::OperatorWithKernel { int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims"); int y_num_col_dims = ctx->Attrs().Get("y_num_col_dims"); - VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims - << " x_num_col_dims=" << x_num_col_dims - << " y_num_col_dims=" << y_num_col_dims; + VLOG(30) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims + << " x_num_col_dims=" << x_num_col_dims + << " y_num_col_dims=" << y_num_col_dims; PADDLE_ENFORCE_GT( x_dims.size(), x_num_col_dims, @@ -126,6 +126,14 @@ or not. But the output only shares the LoD information with input $X$. 
} }; +class MulOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; + } +}; + class MulGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -178,7 +186,8 @@ class MulOpGradMaker : public framework::SingleGradOpDescMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGradMaker); +REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpInferVarType, + ops::MulOpGradMaker); REGISTER_OPERATOR(mul_grad, ops::MulGradOp); REGISTER_OP_CPU_KERNEL( mul, ops::MulKernel, diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc index 8de974bc2b333fb6ccc5b5f0bb1af86533139925..9db0031a6934537a7d991b775ecac688ae6b66e9 100644 --- a/paddle/fluid/operators/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl_op.cu.cc @@ -63,16 +63,16 @@ class NCCLAllReduceKernel : public framework::OpKernel { // device id int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); - VLOG(3) << "gpu : " - << " invoke allreduce. send " << x->numel() << " recv " - << out->numel(); + VLOG(30) << "gpu : " + << " invoke allreduce. send " << x->numel() << " recv " + << out->numel(); PADDLE_ENFORCE(platform::dynload::ncclAllReduce( x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, reduction_op_, comm->comms().at(idx), ctx.cuda_device_context().stream())); - VLOG(3) << "gpu : " - << " finished allreduce. send " << x->numel() << " recv " - << out->numel(); + VLOG(30) << "gpu : " + << " finished allreduce. send " << x->numel() << " recv " + << out->numel(); } }; @@ -109,14 +109,14 @@ class NCCLReduceKernel : public framework::OpKernel { } else { out->Resize(framework::make_ddim({0})); } - VLOG(3) << "gpu : " << gpu_id << " invoke reduce. 
send " << x->numel() - << " recv " << out->numel(); + VLOG(30) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() + << " recv " << out->numel(); PADDLE_ENFORCE(platform::dynload::ncclReduce( x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, reduction_op_, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); - VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel() - << " recv " << out->numel(); + VLOG(30) << "gpu : " << gpu_id << " finished reduce. send " << x->numel() + << " recv " << out->numel(); } }; @@ -133,21 +133,22 @@ class NCCLBcastKernel : public framework::OpKernel { int idx = comm->GetCommId(gpu_id); if (idx == root) { auto* x = ctx.Input("X"); - VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); + VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); PADDLE_ENFORCE(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), x->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); - VLOG(3) << "gpu : " << gpu_id << " finished Bcast."; + VLOG(30) << "gpu : " << gpu_id << " finished Bcast."; } else { auto* out = ctx.Output("Out"); - VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " - << framework::product(out->dims()); + VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " + << framework::product(out->dims()); PADDLE_ENFORCE(platform::dynload::ncclBcast( out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); - VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel(); + VLOG(30) << "gpu : " << gpu_id << " finished Bcast. 
recv " + << out->numel(); } } }; diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index d5fb7a12e5d9757f3e639f6de7f0129bd531e2a1..f48ccdd97fa5adb475013cf26e7544c2729b4457 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -86,9 +86,9 @@ class NCCLTester : public ::testing::Test { (*p_scopes).resize(gpu_list_.size()); auto op = f::OpRegistry::CreateOp(*op1); - VLOG(1) << "invoke NCCLInitOp."; + VLOG(10) << "invoke NCCLInitOp."; op->Run(g_scope_, cpu_place); - VLOG(1) << "NCCLInitOp finished."; + VLOG(10) << "NCCLInitOp finished."; } int GetGPUData(int gpu_id) { return gpu_id + 42; } @@ -109,7 +109,7 @@ class NCCLTester : public ::testing::Test { std::vector send_vector(f::product(kDims), GetGPUData(gpu_id)); paddle::framework::TensorFromVector(send_vector, *ctx, send_tensor); - VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); + VLOG(10) << "Send Tensor filled with elements " << send_tensor->numel(); } lk.unlock(); @@ -119,11 +119,11 @@ class NCCLTester : public ::testing::Test { auto op = f::OpRegistry::CreateOp(*op1); - VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); - VLOG(1) << " send_tensor : " << send_tensor->numel() - << " recv_tensor : " << recv_tensor->numel(); + VLOG(10) << "Device : " << gpu_id << " invoke " << op_desc.Type(); + VLOG(10) << " send_tensor : " << send_tensor->numel() + << " recv_tensor : " << recv_tensor->numel(); op->Run(*scope, place); - VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); + VLOG(10) << "Device : " << gpu_id << " finished " << op_desc.Type(); } public: diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index ab25628d45699dbcfc1fc5792958bae9e42e72a3..c795d4bdd10c0ffbf30a4849fc773335036e34c2 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -48,7 +48,7 @@ static void 
SplitTensorAndMoveTensorToScopes( auto lod_tensors = tensor.SplitLoDTensor(places); for (auto &lod : lod_tensors) { - VLOG(3) << lod.dims(); + VLOG(30) << lod.dims(); } if (num_sub_scopes == 0) { num_sub_scopes = lod_tensors.size(); @@ -263,7 +263,7 @@ class ParallelDoGradOp : public framework::OperatorBase { if (s == framework::kEmptyVarName) { continue; } - VLOG(3) << "Moving " << s; + VLOG(30) << "Moving " << s; CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); } WaitOnPlaces(places); @@ -277,7 +277,7 @@ class ParallelDoGradOp : public framework::OperatorBase { if (s == framework::kEmptyVarName) { continue; } - VLOG(3) << "Accumulating " << s; + VLOG(30) << "Accumulating " << s; if (s == framework::kEmptyVarName) continue; std::string tmp_name; auto *tmp = sub_scopes[0]->Var(&tmp_name); @@ -289,7 +289,7 @@ class ParallelDoGradOp : public framework::OperatorBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, framework::AttributeMap{{"use_mkldnn", {false}}}); - VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); + VLOG(100) << sum_op->DebugStringEx(sub_scopes[0]); sum_op->Run(*sub_scopes[0], places[0]); WaitOnPlace(places[0]); } @@ -316,7 +316,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { auto *grad = new framework::OpDesc(); grad->SetType("parallel_do_grad"); for (auto &input_param : this->InputNames()) { - VLOG(3) << input_param; + VLOG(30) << input_param; grad->SetInput(input_param, this->Input(input_param)); if (input_param != kPlaces) { grad->SetOutput(framework::GradVarName(input_param), diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 484cb65746612343fafc49fe61b607f2e919cf4f..46a95350a7293c18313811ba9b367fd65955145a 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -40,7 +40,7 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride, return output_size; } -void 
PoolOp::InferShape(framework::InferShapeContext *ctx) const { +void PoolOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) of Pooling should not be null."); @@ -81,7 +81,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { } framework::OpKernelType PoolOp::GetExpectedKernelType( - const framework::ExecutionContext &ctx) const { + const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); @@ -104,7 +104,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( layout_, library_); } -void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { +void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); @@ -112,7 +112,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { } framework::OpKernelType PoolOpGrad::GetExpectedKernelType( - const framework::ExecutionContext &ctx) const { + const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); @@ -262,6 +262,14 @@ Example: )DOC"); } +class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; + } +}; + void Pool3dOpMaker::Make() { AddInput("X", "(Tensor) The input tensor of pooling operator. 
" @@ -372,6 +380,7 @@ Example: namespace ops = paddle::operators; REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker, + ops::PoolOpInferVarType, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad); @@ -383,6 +392,7 @@ REGISTER_OP_CPU_KERNEL( ops::PoolGradKernel); REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker, + ops::PoolOpInferVarType, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad); diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 490dfa41be2de987c51b7f06d988ce27980aa5f2..55853d25460bf6e3d07c829d686e71cc9367118c 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -48,12 +48,12 @@ class PrefetchOp : public framework::OperatorBase { std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " - << outs[i] << " back"; + VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get " + << outs[i] << " back"; rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i])); } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; + VLOG(30) << "don't send no-initialied variable: " << ins[i]; } } for (size_t i = 0; i < rets.size(); i++) { diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index d68ba9d661698bb0d33b139f5748daec2ead6595..5f1a48b6de01550978638917e3c66ef2851ee2ed 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -155,8 +155,8 @@ class RandomCropKernel : public framework::OpKernel { seed = *cpu_seed.data(); } } else { - VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute " - "'startup_seed' instead."; + VLOG(50) << "WARNING: The input 'Seed' is not initialized, use attribute " + "'startup_seed' instead."; seed = 
ctx.Attr("startup_seed"); } auto shape = ctx.Attr>("shape"); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5a08d431d96a3a92479dec09119c27e..618248f87298d62078aeccfa135b853b9d2b1744 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -42,7 +42,7 @@ class BlockingQueue { std::unique_lock lock(mutex_); send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); if (closed_) { - VLOG(5) + VLOG(50) << "WARNING: Sending an element to a closed reader::BlokcingQueue."; return false; } @@ -56,7 +56,7 @@ class BlockingQueue { std::unique_lock lock(mutex_); send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); if (closed_) { - VLOG(5) + VLOG(50) << "WARNING: Sending an element to a closed reader::BlokcingQueue."; return false; } diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index 3f72890a7cee1453585d50afa04fa62a9b059dc3..3fe4e9e7adee071fd56cf9f3d2560829f096ba9b 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -26,7 +26,7 @@ class ShuffleReader : public framework::DecoratedReader { ShuffleReader(const std::shared_ptr& reader, size_t buffer_size, size_t seed = 0) : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) { - VLOG(10) << "Create shuffle reader of " << reader_; + VLOG(100) << "Create shuffle reader of " << reader_; if (seed_ == 0) { std::random_device device; seed_ = device(); @@ -37,7 +37,7 @@ class ShuffleReader : public framework::DecoratedReader { void ReadNextImpl(std::vector* out) override { out->clear(); if (iteration_pos_ >= buffer_.size()) { - VLOG(10) << "Resetting shuffle buffer"; + VLOG(100) << "Resetting shuffle buffer"; ReloadBuffer(); if (buffer_.empty()) { return; @@ -73,7 +73,7 @@ class 
ShuffleReader : public framework::DecoratedReader { std::mt19937 g(seed_); std::shuffle(buffer_.begin(), buffer_.end(), g); seed_ = g(); // update seed_; - VLOG(10) << "random buffer size = " << buffer_.size(); + VLOG(100) << "random buffer size = " << buffer_.size(); } size_t buffer_size_; diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 162bfcbb0844d29385d0f8ad5d25a3f8de6bd41b..283dce93212ac91fc4a3276598c1f32cfd36d1e7 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -160,7 +160,7 @@ class RecurrentBase : public framework::OperatorBase { Callback callback) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); for (size_t i = 0; i < dst_vars.size(); ++i) { - VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; + VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i]; AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); } } @@ -176,7 +176,7 @@ class RecurrentBase : public framework::OperatorBase { Callback callback) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); for (size_t i = 0; i < dst_vars.size(); ++i) { - VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; + VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i]; AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); } } @@ -230,7 +230,7 @@ class RecurrentOp : public RecurrentBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { auto seq_len = static_cast(this->GetSequenceLength(scope)); - VLOG(3) << "Static RNN input sequence length = " << seq_len; + VLOG(30) << "Static RNN input sequence length = " << seq_len; StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); @@ -241,7 +241,7 @@ class RecurrentOp : public RecurrentBase { for (size_t i = 0; i < seq_len; ++i) { size_t seq_offset = reverse ? 
seq_len - i - 1 : i; - VLOG(3) << "Recurrent operate at the time step " << seq_offset; + VLOG(30) << "Recurrent operate at the time step " << seq_offset; auto &cur_scope = scopes.CurScope(); @@ -334,7 +334,7 @@ class RecurrentGradOp : public RecurrentBase { for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; - VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; + VLOG(30) << "Recurrent backward operate at the time step " << seq_offset; auto &cur_scope = scopes.CurScope(); // Link outside::output_grads --> inside::output_grads // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] @@ -348,11 +348,11 @@ class RecurrentGradOp : public RecurrentBase { }); auto og_set = List2Set(Inputs(kOutputGrads)); - if (VLOG_IS_ON(10)) { + if (VLOG_IS_ON(100)) { std::ostringstream sout; std::copy(og_set.begin(), og_set.end(), std::ostream_iterator(sout, ",")); - VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; + VLOG(100) << " RNN output gradients = [" << sout.str() << "]"; } // Link states @@ -374,7 +374,7 @@ class RecurrentGradOp : public RecurrentBase { auto &ex_tensor = ex_scope.FindVar(ex_grad)->Get(); - VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad; + VLOG(100) << " RNN link " << cur_grad << " from " << ex_grad; auto *cur_grad_var = cur_scope.Var(cur_grad); auto cur_grad_tensor = cur_grad_var->GetMutable(); @@ -382,12 +382,12 @@ class RecurrentGradOp : public RecurrentBase { } } - VLOG(5) << "Recurrent memory linking finished "; + VLOG(50) << "Recurrent memory linking finished "; // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/); - VLOG(5) << "executor.Run finished "; + VLOG(50) << "executor.Run finished "; auto local_var_names = LocalVarNames(cur_scope); @@ -436,7 +436,7 @@ class RecurrentGradOp : public RecurrentBase { cur_scope.Rename(new_inside_name, inside_grad_name); } } - VLOG(5) 
<< "Accumulate Parameter finished "; + VLOG(50) << "Accumulate Parameter finished "; // Copy input gradient from inside to outside // outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad @@ -455,7 +455,7 @@ class RecurrentGradOp : public RecurrentBase { auto dst = outside->Slice(seq_offset, seq_offset + 1); framework::TensorCopy(inside, place, dev_ctx, &dst); }); - VLOG(5) << "Link outside gradient finished "; + VLOG(50) << "Link outside gradient finished "; if (step_id + 1 == seq_len) { // at_end // copy initialize states gradient from inside to outside @@ -468,7 +468,7 @@ class RecurrentGradOp : public RecurrentBase { outside->mutable_data(place, inside.type()); framework::TensorCopy(inside, place, dev_ctx, outside); }); - VLOG(5) << "Link initialize state gradient finished "; + VLOG(50) << "Link initialize state gradient finished "; } scopes.Next(); } diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 0399ff41007fbe10da8d53a05671eb0cfb475a5f..fbbd86502bfc61c004f88971526195f6a083d5a9 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -47,7 +47,7 @@ class RecvOp : public framework::OperatorBase { std::vector rets; for (size_t i = 0; i < outs.size(); i++) { - VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; + VLOG(30) << "getting " << outs[i] << " from " << epmap[i]; rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); } if (sync_mode) { diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu index 0d86b3127e42f7ee14ba57b1c762e8128a0f2d54..b21da178f3eeaafa41bde5f64cc4abcf7944b032 100644 --- a/paddle/fluid/operators/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max, int, ops::MaxFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - 
ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_max_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..6954c8d744faee6f8f0b715d6e4c8e3bcda7fb83 --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 59b30244839849d79e3e531953134633503c4090..4408200d2d052c2f68c2dd35619de6ed67f07f6e 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, ops::ReduceMeanKernel, ops::ReduceMeanKernel, ops::ReduceMeanKernel); - -REGISTER_OP_CUDA_KERNEL( - reduce_mean_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_mean_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..4b663bcdca7c20f8802d962a362f429d8eafe9af --- /dev/null +++ 
b/paddle/fluid/operators/reduce_mean_op.part.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// .part used to speed up nvcc compile +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu index da466f805eff4709dc23471baef03e94052ee6c1..5a04a12b79444dcea30d3c1140d9708a98b55fe3 100644 --- a/paddle/fluid/operators/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min, int, ops::MinFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_min_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..5b8f061b2d03eb76863401905ac87044fd5ea778 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu index d62e677d92cffecf629d1684026b0c7bcfec29e3..d8692afb96e4d5d3206210060684dd12fb4d79a7 100644 --- a/paddle/fluid/operators/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod, int, ops::ProdFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_prod_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..486c578c64b9a2d80abc940a7c4266ef5fd23c7f --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index 53cd9e9419dd9aecee730917ae21d7a4ab332ffc..2b031e8df99768c9208146640bddbe51149b2614 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel { REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel); - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_sum_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..525633f62a95b2d0d677fcbebe551b75cb2a180d --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.part.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/cub_reduce.h" +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index 0fb7776fd9dbf437673820c7cf9411644272626c..b840e690960cf77a37895f5b3d83c4cdbc2fca35 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -93,7 +93,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { in_grad_var_name); if (out_grad_var == nullptr) { - VLOG(5) << "Using fill constant 0 as starting gradient"; + VLOG(50) << "Using fill constant 0 as starting gradient"; auto in_var_name = Input("X"); auto *in_var = scope.FindVar(in_var_name); auto &in_var_tensor = in_var->Get(); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e79cffcf498c52ed14db235f6221cfdf08399c9d..0dcf3f0e372f07370078553465973edfd7c96e07 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -110,7 +110,7 @@ class SaveOp : public framework::OperatorBase { lt_var != nullptr, "Can not find variable kLookupTablePath for SaveSelectedRows"); std::string filename = lt_var->data(); - VLOG(4) << "SaveSelectedRows get File name: " << filename; + VLOG(40) << "SaveSelectedRows get File name: " << filename; MkDirRecursively(DirName(filename).c_str()); diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index ac7d69bfb549fd98c76fcf834e8d3ad9bec2ef23..b2e79f6c82bb748293f4219845e6798347c8c46e 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -51,7 +51,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check 
index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index 39af717615c01f5c121e32b176b74d05be738531..8bae6606c94620ab4fa8ae34f69236e7e87e9670 100644 --- a/paddle/fluid/operators/scatter.h +++ b/paddle/fluid/operators/scatter.h @@ -37,7 +37,8 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 8ca2877d8adad643089587fcee0917affa537f7d..02ca107ca35348df1827805e40730acd39f39e87 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -42,12 +42,12 @@ class SendBarrierOp : public framework::OperatorBase { distributed::RPCClient::GetInstance( Attr("trainer_id")); - VLOG(3) << "SendBarrierOp sync"; + VLOG(30) << "SendBarrierOp sync"; // need to wait before sending send_barrier message PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (auto& ep : eps) { - VLOG(3) << "send barrier, ep: " << ep; + VLOG(30) << "send barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index be1dc4bf14c3394963822b065ca088afbfacd858..0ad43d56d3cd7500290dc1e386a2dbaf4453a191 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -50,10 
+50,10 @@ class SendOp : public framework::OperatorBase { std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + VLOG(30) << "sending " << ins[i] << " to " << epmap[i]; rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; + VLOG(30) << "don't send no-initialied variable: " << ins[i]; } } if (sync_send) { diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index aee6180add5708d31f7ce927b37c4524a291fe3c..d79b16e3cca714d44c88834082cea9367480da9a 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -120,7 +120,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, void StartServerNet(bool is_sparse, std::atomic *initialized) { f::Scope scope; p::CPUPlace place; - VLOG(4) << "before init tensor"; + VLOG(40) << "before init tensor"; if (is_sparse) { InitSelectedRowsInScope(place, &scope); } else { @@ -146,7 +146,7 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector({""})}); attrs.insert({"sync_mode", true}); - VLOG(4) << "before init op"; + VLOG(40) << "before init op"; listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); *initialized = true; diff --git a/paddle/fluid/operators/sequence_mask_op.h b/paddle/fluid/operators/sequence_mask_op.h index 18acb735cecabd1e01f7821c880fd8ed5e52971f..7ff68f9c715e4c7243afe9de84af9474e7e4e260 100644 --- a/paddle/fluid/operators/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_mask_op.h @@ -127,7 +127,7 @@ class SequenceMaskKernel : public framework::OpKernel { auto x_numel = x->numel(); if (maxlen < 0) { #ifdef __NVCC__ - VLOG(10) + VLOG(100) << "SequenceMaskOp on GPU may be 
slow when maxlen is not provided."; maxlen = static_cast( thrust::reduce(thrust::device_pointer_cast(x_data), diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index d8b0165b2a89b04bd55671a37d96ee4ba275b2eb..2e206c963ea009b436bb03433d30683a29fe83aa 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -98,10 +98,10 @@ class SGDOpKernel : public framework::OpKernel { auto param_row_width = param.value().dims()[1]; auto grad_row_width = grad.value().dims()[1]; - VLOG(4) << " param rows: " << param.rows().size() - << " param memory rows: " << param.value().dims()[0] - << " grad rows: " << grad.rows().size() - << " grad memory rows: " << grad.value().dims()[0]; + VLOG(40) << " param rows: " << param.rows().size() + << " param memory rows: " << param.value().dims()[0] + << " grad rows: " << grad.rows().size() + << " grad memory rows: " << grad.value().dims()[0]; PADDLE_ENFORCE_EQ(param_row_width, grad_row_width, "param_row should have the same size with grad_row"); diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9612f82b6d45dc4e08bfe288ddd1c7790875ee4d --- /dev/null +++ b/paddle/fluid/operators/similarity_focus_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/similarity_focus_op.h" + +namespace paddle { +namespace operators { +class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a 4-D tensor with shape," + " [BatchSize, X, Y, Z]"); + AddOutput("Out", + "(Tensor, default Tensor), the similarity focus mask" + " with the same shape of input X."); + AddAttr("axis", + "(int32), indicating the dimension to be select. It can" + " only be 1, 2, or 3."); + AddAttr>("indexes", + "(std::vector), indicating the indexes" + " of the selected dimension."); + AddComment(R"DOC( +SimilarityFocus Operator. + +Generate a similarity focus mask with the same shape of input using the following method: +1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding + to the axis according to the indexes. For example, if axis=1 and indexes=[a], + it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X + is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C). +2. For each index, find the largest numbers in the tensor T, so that the same + row and same column has at most one number(what it means is that if the + largest number has been found in the i-th row and the j-th column, then + the numbers in the i-th row or j-th column will be skipped. And then the + next largest number will be selected from the remaining numbers. Obviously + there will be min(B, C) numbers), and mark the corresponding position of the + 3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for + each index. +3. Broadcast the 3-D similarity focus mask to the same shape of input X. 
+ +Refer to `Similarity Focus Layer `_ +)DOC"); + } +}; + +class SimilarityFocusOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, "Input(X)'s rank should be 4."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(similarity_focus, ops::SimilarityFocusOp, + ops::SimilarityFocusOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(similarity_focus, ops::SimilarityFocusKernel, + ops::SimilarityFocusKernel); diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bf3fed2aaf2cf92d5619ae5bce6dd70d9dfe9621 --- /dev/null +++ b/paddle/fluid/operators/similarity_focus_op.h @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class SimilarityFocusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + Tensor* out = context.Output("Out"); + const Tensor* x = context.Input("X"); + T* out_data = out->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + + int axis = context.Attr("axis"); + std::vector indexes = context.Attr>("indexes"); + + int64_t batch_size = x->dims()[0]; + int64_t dim[4]; + for (int i = 1; i <= 3; ++i) { + dim[i] = x->dims()[i]; + } + + if (indexes.size() < 1) { + PADDLE_THROW("Indexes' size can not be 0."); + } + for (auto index : indexes) { + if (dim[axis] < index) { + PADDLE_THROW("Index exceeds tensor shape limit."); + } + } + + int64_t array_size = 1; + for (int i = 1; i <= 3; ++i) { + if (i != axis) { + array_size *= dim[i]; + } + } + + std::vector> array(array_size); + + bool (*cmp)(std::pair, std::pair) = []( + std::pair x, std::pair y) { + return x.first > y.first; + }; + + int64_t (*compute_index)(int64_t*, int, int, int, int) = []( + int64_t* dim, int d1, int d2, int d3, int d4) { + return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] + + d3 * dim[3] + d4; + }; + + memset(out_data, 0, sizeof(T) * batch_size * dim[1] * dim[2] * dim[3]); + for (int i = 0; i < batch_size; ++i) { + for (auto index : indexes) { + if (axis == 1) { + for (int j = 0; j < dim[2]; ++j) { + for (int k = 0; k < dim[3]; ++k) { + array[j * dim[3] + k] = std::make_pair( + x_data[compute_index(dim, i, index, j, k)], j * dim[3] + k); + } + } + + std::sort(array.begin(), array.end(), cmp); + int tag_num = 0; + std::vector tag2(dim[2]), tag3(dim[3]); + for (auto x 
: array) { + int idx2 = x.second / dim[3]; + int idx3 = x.second % dim[3]; + if (tag2[idx2] || tag3[idx3]) { + continue; + } + tag_num++; + tag2[idx2] = true; + tag3[idx3] = true; + for (int j = 0; j < dim[1]; ++j) { + out_data[compute_index(dim, i, j, idx2, idx3)] = 1; + } + if (tag_num == std::min(dim[2], dim[3])) { + break; + } + } + } else if (axis == 2) { + for (int j = 0; j < dim[1]; ++j) { + for (int k = 0; k < dim[3]; ++k) { + array[j * dim[3] + k] = std::make_pair( + x_data[compute_index(dim, i, j, index, k)], j * dim[3] + k); + } + } + + std::sort(array.begin(), array.end(), cmp); + int tag_num = 0; + std::vector tag1(dim[1]), tag3(dim[3]); + for (auto x : array) { + int idx1 = x.second / dim[3]; + int idx3 = x.second % dim[3]; + if (tag1[idx1] || tag3[idx3]) { + continue; + } + tag_num++; + tag1[idx1] = true; + tag3[idx3] = true; + for (int j = 0; j < dim[2]; ++j) { + out_data[compute_index(dim, i, idx1, j, idx3)] = 1; + } + if (tag_num == std::min(dim[1], dim[3])) { + break; + } + } + } else if (axis == 3) { + for (int j = 0; j < dim[1]; ++j) { + for (int k = 0; k < dim[2]; ++k) { + array[j * dim[2] + k] = std::make_pair( + x_data[compute_index(dim, i, j, k, index)], j * dim[2] + k); + } + } + + std::sort(array.begin(), array.end(), cmp); + int tag_num = 0; + std::vector tag1(dim[1]), tag2(dim[2]); + for (auto x : array) { + int idx1 = x.second / dim[2]; + int idx2 = x.second % dim[2]; + if (tag1[idx1] || tag2[idx2]) { + continue; + } + tag_num++; + tag1[idx1] = true; + tag2[idx2] = true; + for (int j = 0; j < dim[3]; ++j) { + out_data[compute_index(dim, i, idx1, idx2, j)] = 1; + } + if (tag_num == std::min(dim[1], dim[2])) { + break; + } + } + } else { + PADDLE_THROW("Axis must be 1 or 2 or 3"); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index a4bdbe6648afa7c91a056af4737bb5d826229022..9e21b6c824bfd7d1c1090e5ba3ba2f6aa9bdb230 100644 --- 
a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -124,6 +124,14 @@ For each row $i$ and each column $j$ in the matrix, we have: } }; +class SoftmaxOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; + } +}; + class SoftmaxOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -196,7 +204,7 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, - ops::SoftmaxOpGradMaker); + ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( softmax, ops::SoftmaxKernel, diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h index fedd7218dd6cc9481e94a92a3820cafbe4157bd0..3b7ae6fc91e0a9e08406e38b9a557cab442c2560 100644 --- a/paddle/fluid/operators/split_byref_op.h +++ b/paddle/fluid/operators/split_byref_op.h @@ -32,7 +32,7 @@ class SplitByrefOpKernel : public framework::OpKernel { for (size_t i = 0; i < outs.size(); ++i) { // NOTE: no need to call mutable_data here to allocate memory. 
auto* out = outs[i]; - VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0]; + VLOG(30) << "spliting by ref: " << row_offset << " " << out->dims()[0]; *out = in->Slice(row_offset, row_offset + out->dims()[0]); row_offset += out->dims()[0]; } diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index c8b0e7152106db6fa822a7b17cdb6888e75e310b..6dbada3da8826f0e7cb07a9642d327e5ee38c309 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -44,7 +44,7 @@ class SplitIdsOpKernel : public framework::OpKernel { for (size_t i = 0; i < ids_tensors.size(); ++i) { batch_size += ids_tensors[i]->dims()[0]; } - VLOG(4) << "Get Total BatchSize is: " << batch_size; + VLOG(40) << "Get Total BatchSize is: " << batch_size; std::vector all_ids(batch_size); int offset = 0; diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index 3f4b48bc7391def082c82ed451fc5a752009a2f1..9345b495415d203728238c19621a20f446c40bf5 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -21,8 +21,12 @@ REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel, - ops::StackKernel); + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); REGISTER_OP_CPU_KERNEL(stack_grad, ops::StackGradKernel, - ops::StackGradKernel); + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 92c1bde2bcf089e5c715e90e564408e6ad37ba17..bf2a9e5b3d22996e688621727cb280dc9aed7859 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -18,8 +18,12 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel, - ops::StackKernel); + ops::StackKernel, + ops::StackKernel, + 
ops::StackKernel); REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradKernel, - ops::StackGradKernel); + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f9a16ef35ecb9eeb6c8eda9d124ecb17e7f9d5ce..2ae5c17bf6465874572e80da54e40fbe22403660 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -186,7 +186,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } if (in_dim.empty()) { - VLOG(3) << "WARNING: all the inputs are empty"; + VLOG(30) << "WARNING: all the inputs are empty"; in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); } else { in_dim[0] = static_cast(first_dim); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 7df14158f3429e25fa972a51ef2615cf569e9a73..c67b694283cd8f0203021c0329f5ac16ae7854a5 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -45,7 +45,7 @@ class SumOp : public framework::OperatorWithKernel { size_t N = x_dims.size(); PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0."); if (N == 1) { - VLOG(3) << "Warning: sum have only one input, may waste memory"; + VLOG(30) << "Warning: sum have only one input, may waste memory"; } framework::DDim in_dim({0}); @@ -157,8 +157,8 @@ class SumOpVarTypeInference : public framework::VarTypeInference { auto& inputs = op_desc.Input("X"); auto var_type = framework::proto::VarType::SELECTED_ROWS; for (auto& name : op_desc.Input("X")) { - VLOG(10) << name << " " - << block->FindRecursiveOrCreateVar(name).GetType(); + VLOG(100) << name << " " + << block->FindRecursiveOrCreateVar(name).GetType(); } bool any_input_is_lod_tensor = std::any_of( diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc index 
a2d44284e9de1ace42cabbce82e0b45929432d7b..484160aeb8de573c6a6c1bb2ea5da23600d2d287 100644 --- a/paddle/fluid/operators/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -34,8 +34,8 @@ class WriteToArrayOp : public ArrayOp { auto *out = scope.FindVar(Output("Out"))->GetMutable(); if (offset >= out->size()) { - VLOG(10) << "Resize " << Output("Out") << " from " << out->size() - << " to " << offset + 1; + VLOG(100) << "Resize " << Output("Out") << " from " << out->size() + << " to " << offset + 1; out->resize(offset + 1); } auto *out_tensor = &out->at(offset); @@ -47,9 +47,9 @@ class WriteToArrayOp : public ArrayOp { TensorCopy(x_tensor, place, dev_ctx, out_tensor); } else { - VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " - "nothing has been written to output array[" - << offset << "]."; + VLOG(100) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; } } }; @@ -104,7 +104,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { framework::BlockDesc *block) const override { auto x_name = op_desc.Input("X")[0]; auto out_name = op_desc.Output("Out")[0]; - VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; + VLOG(100) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; auto &out = block->FindRecursiveOrCreateVar(out_name); out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); auto *x = block->FindVarRecursive(x_name); @@ -139,7 +139,7 @@ class ReadFromArrayOp : public ArrayOp { framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); } else { - VLOG(10) << "offset " << offset << " >= " << x_array.size(); + VLOG(100) << "offset " << offset << " >= " << x_array.size(); } } }; diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..96dc123f6a36e1a2b6ae04e0d97dffe1e10ac4ea --- /dev/null +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -0,0 +1,246 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +void LodTensorArray2LodTensorVector(const framework::Scope &scope, + const std::string &base_name, + const std::string &lod_tensor_array_name, + std::vector *res_names) { + auto &inx = + scope.FindVar(lod_tensor_array_name)->Get(); + for (size_t i = 0; i < inx.size(); i++) { + std::string var_name = base_name + std::to_string(i); + framework::Variable *g_feed_value = + const_cast(scope).Var(var_name); + auto &feed_input = + *(g_feed_value->GetMutable()); + feed_input.ShareDataWith(inx[i]); + res_names->push_back(var_name); + } +} + +void LodTensorVectorResizeFromLodTensorArray( + const framework::Scope &scope, const std::string &base_name, + const std::string &lod_tensor_array_name, + std::vector *res_names) { + auto &inx = + scope.FindVar(lod_tensor_array_name)->Get(); + for (size_t i = 0; i < inx.size(); i++) { + std::string var_name = base_name + std::to_string(i); + framework::Variable *g_feed_value = + const_cast(scope).Var(var_name); + auto &feed_input = + 
*(g_feed_value->GetMutable()); + auto dims = inx[i].dims(); + feed_input.Resize(dims); + res_names->push_back(var_name); + } +} + +void LodTensorArrayCreateFromLodTensorArray( + const framework::Scope &scope, + const std::string &input_lod_tensor_array_name, + const std::string &output_lod_tensor_array_name) { + auto &inx = scope.FindVar(input_lod_tensor_array_name) + ->Get(); + auto &grad_inx = *scope.FindVar(output_lod_tensor_array_name) + ->GetMutable(); + + for (size_t i = 0; i < inx.size(); i++) { + std::string var_name = output_lod_tensor_array_name + std::to_string(i); + framework::Variable *g_feed_value = + const_cast(scope).Var(var_name); + auto &feed_input = + *(g_feed_value->GetMutable()); + grad_inx.push_back(feed_input); + } +} + +class LoDTensorArray2TensorOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto axis = Attr("axis"); + + framework::AttributeMap attrs; + attrs["axis"] = axis; + + auto &inx = scope.FindVar(Input("X"))->Get(); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + auto &out_inx = + *scope.FindVar(Output("OutIndex"))->GetMutable(); + + const size_t n = inx.size(); + PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0."); + + std::string base_name = Inputs("X")[0]; + std::vector names; + + // get the input tensorarray items' dim in out_inx + auto out_inx_dim = out_inx.dims(); + out_inx_dim[0] = inx.size(); + out_inx.Resize(out_inx_dim); + + std::string var_name = "out_index"; + framework::Variable *tmp_index_var = + const_cast(scope).Var(var_name); + auto &tmp_index_tensor = + *(tmp_index_var->GetMutable()); + tmp_index_tensor.Resize(out_inx_dim); + int *tmp_index_data = + tmp_index_tensor.mutable_data(platform::CPUPlace()); + + auto out_dims = inx[0].dims(); + size_t out_dim_sum = 0; + for (size_t index = 0; index < inx.size(); index++) { + auto inx_dims = 
inx[index].dims(); + out_dim_sum += inx_dims[axis]; + tmp_index_data[index] = inx_dims[axis]; + } + out_inx.ShareDataWith(tmp_index_tensor); + + // set the output dims: size along axis is the sum over all input items + out_dims[axis] = out_dim_sum; + out.Resize(out_dims); + + LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names); + // Invoke concat Op + auto concat_op = framework::OpRegistry::CreateOp( + "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs); + + concat_op->Run(scope, place); + } +}; + +class LoDTensorArray2TensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input LoDTensorArray of tensor_array_to_tensor operator."); + AddOutput("Out", "Output tensor of tensor_array_to_tensor operator."); + AddOutput("OutIndex", + "Output input LoDTensorArray items' dims of " + "tensor_array_to_tensor operator."); + AddAttr("axis", + "The axis along which the input tensors will be concatenated.") + .SetDefault(0); + AddComment(R"DOC( +tensor_array_to_tensor Operator. + +Concatenate the input LoDTensorArray along dimension axis to the output Tensor. 
+Examples: + Input = {[1,2], [3,4], [5,6]} + axis = 0 + Output = [[1,2], + [3,4], + [5,6]] + OutputIndex = [1,1,1] + +)DOC"); + } +}; + +class LoDTensorArray2TensorOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + +class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override {} +}; + +class LoDTensorArray2TensorGradInferVarType + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) { + block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); + } + } +}; + +class LoDTensorArray2TensorGradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto axis = Attr("axis"); + framework::AttributeMap attrs; + attrs["axis"] = axis; + + auto &inx = scope.FindVar(Input("X"))->Get(); + const size_t n = inx.size(); + PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0."); + + std::string base_name = Inputs("X")[0]; + std::vector names; + + LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names); + + // grad + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + + std::vector grad_names; + + LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", Input("X"), + &grad_names); + + auto concat_grad_op = framework::OpRegistry::CreateOp( + "concat_grad", {{"X", names}, {"Out@GRAD", {dout_name}}}, + {{"X@GRAD", grad_names}}, attrs); + + concat_grad_op->Run(scope, place); + + LodTensorArrayCreateFromLodTensorArray(scope, Input("X"), dx_name); + auto &grad_inx = + *scope.FindVar(dx_name)->GetMutable(); + + for 
(size_t i = 0; i < grad_names.size(); i++) { + std::string var_name = grad_names[i]; + auto &feed_input = scope.FindVar(var_name)->Get(); + grad_inx[i].ShareDataWith(feed_input); + } + } +}; + +} // namespace operators +} // namespace paddle +USE_OP(concat); + +namespace ops = paddle::operators; +REGISTER_OPERATOR(tensor_array_to_tensor, ops::LoDTensorArray2TensorOp, + ops::LoDTensorArray2TensorOpMaker, + ops::LoDTensorArray2TensorOpInferShape, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(tensor_array_to_tensor_grad, ops::LoDTensorArray2TensorGradOp, + ops::LoDTensorArray2TensorGradInferShape, + ops::LoDTensorArray2TensorGradInferVarType); diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 673f86da76ee0712b4d941f5b33594f89926b973..3af9376da1d3fa096b277e6b5a9d1a8de197d6f1 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -34,7 +34,7 @@ namespace operators { using FluidDT = framework::proto::VarType_Type; using TRT_DT = nvinfer1::DataType; -namespace { +namespace { // NOLINT TRT_DT FluidDataType2TRT(FluidDT type) { switch (type) { @@ -60,7 +60,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { return nvinfer1::DimsCHW(shape[1], 1, 1); } -} // namespace +} // namespace // NOLINT using inference::Singleton; using inference::tensorrt::TRT_EngineManager; @@ -127,9 +127,9 @@ class TensorRTEngineKernel : public framework::OpKernel { // Convert output tensor from engine to fluid int output_index = 0; - VLOG(4) << "TensorRT Engine Op Outputs:"; + VLOG(40) << "TensorRT Engine Op Outputs:"; for (const auto& y : context.Outputs("Ys")) { - VLOG(4) << y; + VLOG(40) << y; // convert output and copy to fluid. 
nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); @@ -167,7 +167,7 @@ class TensorRTEngineKernel : public framework::OpKernel { protected: void Prepare(const framework::ExecutionContext& context) const { - VLOG(4) << "Prepare engine"; + VLOG(40) << "Prepare engine"; // Get the ProgramDesc and pass to convert. framework::proto::BlockDesc block_desc; block_desc.ParseFromString(context.Attr("subgraph")); @@ -192,12 +192,12 @@ class TensorRTEngineKernel : public framework::OpKernel { engine->InitNetwork(); framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - VLOG(4) << "parsed var size " << block.AllVars().size(); + VLOG(40) << "parsed var size " << block.AllVars().size(); // Add inputs - VLOG(4) << "declare inputs"; + VLOG(40) << "declare inputs"; for (auto& input : context.Inputs("Xs")) { if (parameters.count(input)) continue; - VLOG(4) << "declare input " << input; + VLOG(40) << "declare input " << input; auto* var = block.FindVar(input); // TensorRT engine need to create parameters. 
The parameter's description // should be set in diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 3c8a01b6e47459760b05b5ca7fa4fa5e1d37d112..aa6af055decc4856fcf2036d324af6b1ff3a5de0 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -129,15 +129,15 @@ class WhileGradOp : public framework::OperatorBase { for (auto cur_scope_iter = step_scopes->rbegin(); cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { - VLOG(3) << "Start backward at time_step " - << cur_scope_iter - step_scopes->rbegin(); + VLOG(30) << "Start backward at time_step " + << cur_scope_iter - step_scopes->rbegin(); framework::Scope &cur_scope = **cur_scope_iter; // Link OG from outside to inside for (size_t i = 0; i < outside_og_names.size(); ++i) { auto outside_og_name = outside_og_names[i]; auto inside_og_name = inside_og_names[i]; - VLOG(8) << "Linking outside " << outside_og_name << " --> inside " - << inside_og_name; + VLOG(80) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; if (scope.FindVar(outside_og_name) == nullptr) { continue; } @@ -159,11 +159,11 @@ class WhileGradOp : public framework::OperatorBase { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); - VLOG(8) << outside_og_name << " size = " << outside_array.size(); + VLOG(80) << outside_og_name << " size = " << outside_array.size(); inside_array.resize(outside_array.size()); for (size_t j = 0; j < inside_array.size(); ++j) { - VLOG(8) << j << " " << outside_array[j].numel(); + VLOG(80) << j << " " << outside_array[j].numel(); if (outside_array[j].numel() != 0) { inside_array[j].set_lod(outside_array[j].lod()); inside_array[j].ShareDataWith(outside_array[j]); @@ -289,7 +289,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); for (auto &each_ig : igs) { if (inner_op_outputs.find(each_ig) 
== inner_op_outputs.end()) { - VLOG(8) << "Ignore " << each_ig; + VLOG(80) << "Ignore " << each_ig; each_ig = framework::kEmptyVarName; } } @@ -353,8 +353,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); auto *g_var = block->FindVarRecursive(pg_ig_names[i]); if (g_var != nullptr) { // Gradient could be @EMPTY@ - VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i] - << " type: " << p_var.GetType(); + VLOG(50) << "Setting " << pg_ig_names[i] << " following " << p_names[i] + << " type: " << p_var.GetType(); g_var->SetType(p_var.GetType()); g_var->SetDataType(p_var.GetDataType()); } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index ff49a1d57fd977a6d6b4502b44e48aad34cde872..f5541014af5170488efbb10f6e7e331ef015a848 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -204,7 +204,10 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) << "." << (driver_version_ % 100) / 10 << ", Runtime Version: " << runtime_version_ / 1000 << "." << (runtime_version_ % 100) / 10; - + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << place_.device + << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." 
+ << (cudnn_dso_ver % 1000) / 100 << "."; callback_manager_.reset(new StreamCallbackManager(stream_)); } diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index dc1d751141187edb7738e42c41514614d4d399b0..ea4564058d602a9abe43bd063f1ed73f88a2de08 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -203,7 +203,7 @@ class DeviceTracerImpl : public DeviceTracer { void AddCPURecords(const std::string &anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t thread_id) { if (anno.empty()) { - VLOG(1) << "Empty timeline annotation."; + VLOG(10) << "Empty timeline annotation."; return; } std::lock_guard l(trace_mu_); @@ -216,7 +216,7 @@ class DeviceTracerImpl : public DeviceTracer { uint32_t correlation_id, uint64_t bytes) { // 0 means timestamp information could not be collected for the kernel. if (start_ns == 0 || end_ns == 0) { - VLOG(3) << name << " cannot be traced"; + VLOG(30) << name << " cannot be traced"; return; } std::lock_guard l(trace_mu_); @@ -228,7 +228,7 @@ class DeviceTracerImpl : public DeviceTracer { int64_t stream_id, uint32_t correlation_id) { // 0 means timestamp information could not be collected for the kernel. 
if (start == 0 || end == 0) { - VLOG(3) << correlation_id << " cannot be traced"; + VLOG(30) << correlation_id << " cannot be traced"; return; } std::lock_guard l(trace_mu_); @@ -347,7 +347,7 @@ class DeviceTracerImpl : public DeviceTracer { tracer->AddAnnotation(cbInfo->correlationId, anno); } } else { - VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; + VLOG(10) << "Unhandled API Callback for " << domain << " " << cbid; } } CUpti_SubscriberHandle subscriber_; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index d3d754b6f58d25a9dfacafaf55d50b353a71ee6d..c26143d2f2780f3042f66b99808c6b85866f9dc4 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -65,51 +65,54 @@ extern void EnforceCUDNNLoaded(const char* fn_name); * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ - __macro(cudnnGetTensorNdDescriptor); \ - __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnGetConvolutionForwardAlgorithm); \ - __macro(cudnnCreateTensorDescriptor); \ - __macro(cudnnDestroyTensorDescriptor); \ - __macro(cudnnCreateFilterDescriptor); \ - __macro(cudnnSetFilter4dDescriptor); \ - __macro(cudnnSetFilterNdDescriptor); \ - __macro(cudnnGetFilterNdDescriptor); \ - __macro(cudnnSetPooling2dDescriptor); \ - __macro(cudnnSetPoolingNdDescriptor); \ - __macro(cudnnGetPoolingNdDescriptor); \ - __macro(cudnnDestroyFilterDescriptor); \ - __macro(cudnnCreateConvolutionDescriptor); \ - __macro(cudnnCreatePoolingDescriptor); \ - __macro(cudnnDestroyPoolingDescriptor); \ - __macro(cudnnSetConvolution2dDescriptor); \ - __macro(cudnnDestroyConvolutionDescriptor); \ - __macro(cudnnSetConvolutionNdDescriptor); \ - __macro(cudnnGetConvolutionNdDescriptor); \ - 
__macro(cudnnDeriveBNTensorDescriptor); \ - __macro(cudnnCreateSpatialTransformerDescriptor); \ - __macro(cudnnSetSpatialTransformerNdDescriptor); \ - __macro(cudnnDestroySpatialTransformerDescriptor); \ - __macro(cudnnSpatialTfGridGeneratorForward); \ - __macro(cudnnSpatialTfGridGeneratorBackward); \ - __macro(cudnnSpatialTfSamplerForward); \ - __macro(cudnnSpatialTfSamplerBackward); \ - __macro(cudnnCreate); \ - __macro(cudnnDestroy); \ - __macro(cudnnSetStream); \ - __macro(cudnnActivationForward); \ - __macro(cudnnConvolutionForward); \ - __macro(cudnnConvolutionBackwardBias); \ - __macro(cudnnGetConvolutionForwardWorkspaceSize); \ - __macro(cudnnTransformTensor); \ - __macro(cudnnPoolingForward); \ - __macro(cudnnPoolingBackward); \ - __macro(cudnnSoftmaxBackward); \ - __macro(cudnnSoftmaxForward); \ - __macro(cudnnGetVersion); \ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreateSpatialTransformerDescriptor); \ + 
__macro(cudnnSetSpatialTransformerNdDescriptor); \ + __macro(cudnnDestroySpatialTransformerDescriptor); \ + __macro(cudnnSpatialTfGridGeneratorForward); \ + __macro(cudnnSpatialTfGridGeneratorBackward); \ + __macro(cudnnSpatialTfSamplerForward); \ + __macro(cudnnSpatialTfSamplerBackward); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnFindConvolutionForwardAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ __macro(cudnnGetErrorString); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index cc5cda6106c188f3156d33480b5d3641eed32556..d53907b749805d9c16737da3105d6c66cacb12fb 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -72,8 +72,8 @@ static inline std::string join(const std::string& part1, static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, int dynload_flags) { - VLOG(3) << "Try to find library: " << dso_path - << " from default system path."; + VLOG(30) << "Try to find library: " << dso_path + << " from default system path."; // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH // and /usr/local/lib path void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 8fff9844db738dbd6508569a8aaeed044e445e5f..c78f159ad25a17b38333a57a0650d9843c4c5632 100644 --- 
a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -124,8 +124,8 @@ size_t GpuMaxChunkSize() { size_t available = 0; GpuMemoryUsage(&available, &total); - VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" - << total / 1024 / 1024 << "M"; + VLOG(100) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; size_t reserving = static_cast(0.05 * total); // If available less than minimum chunk size, no usable memory exists. available = diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 2211e5504373b4a30e5fda0db22a41bdcd9f2421..4cbfe0a69c06cb6793c877263b2feaafa7c3dc60 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -45,7 +45,7 @@ void InitGflags(std::vector argv) { line += ' '; } google::ParseCommandLineFlags(&argc, &arr, true); - VLOG(1) << "Init commandline: " << line; + VLOG(10) << "Init commandline: " << line; }); } diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 115abb98d56e633c938695c8127c832eab602110..40af1f95208905231b933e5184a807b061164799 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -112,7 +112,7 @@ struct NCCLContextMap { NCCLGroupGuard gurad; for (auto &gpu_id : order_) { int rank = trainer_id * order_.size() + gpu_id; - VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks; + VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks; PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( comms.get() + gpu_id, nranks, *nccl_id, rank)); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index d3b0d4a22954c1d67dc9551b997dcffa0625cbeb..586e92c2b3146d75a673d1fe326dbee7297a3bfb 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -61,9 +61,9 @@ struct variant_caster> { if (std::is_same>::value) { auto caster_ints = make_caster>(); if 
(caster_ints.load(src, convert)) { - VLOG(4) << "This value are floats and int64_ts satisfy " - "simultaneously, will set it's type to " - "std::vector"; + VLOG(40) << "This value are floats and int64_ts satisfy " + "simultaneously, will set it's type to " + "std::vector"; value = cast_op>(caster_ints); return true; } diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc index a0757b53f37b29de0b3802c345b1ad9db69f16e9..ac1ac8e7c2348289516240b6eddf454d02828e2f 100644 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ b/paddle/fluid/train/demo/demo_trainer.cc @@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) { std::unique_ptr Load( paddle::framework::Executor* executor, const std::string& model_filename) { - VLOG(3) << "loading model from " << model_filename; + VLOG(30) << "loading model from " << model_filename; std::string program_desc_str; ReadBinaryFile(model_filename, &program_desc_str); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2f5fef36c423736666695c07ebf69d812c3488ed..a51c9becd416af243cb473c8856141db8d9f3bf0 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -139,6 +139,7 @@ function cmake_gen() { -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} + -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} @@ -171,6 +172,7 @@ EOF -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ + -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ @@ -614,7 +616,24 @@ EOF CMD='"true"' fi - cat >> ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/Dockerfile <`_ + + .. 
code-block:: text + + * Example : + + Given a 4-D tensor x with the shape (BatchSize, C, A, B), where C is + the number of channels and the shape of feature map is (A, B): + x.shape = (2, 3, 2, 2) + x.data = [[[[0.8, 0.1], + [0.4, 0.5]], + + [[0.9, 0.7], + [0.9, 0.9]], + + [[0.8, 0.9], + [0.1, 0.2]]], + + + [[[0.2, 0.5], + [0.3, 0.4]], + + [[0.9, 0.7], + [0.8, 0.4]], + + [[0.0, 0.2], + [0.4, 0.7]]]] + + Given axis: 1 (the axis of the channel) + Given indexes: [0] + + then we get a 4-D tensor out with the same shape of input x: + out.shape = (2, 3, 2, 2) + out.data = [[[[1.0, 0.0], + [0.0, 1.0]], + + [[1.0, 0.0], + [0.0, 1.0]], + + [[1.0, 0.0], + [0.0, 1.0]]], + + [[[0.0, 1.0], + [1.0, 0.0]], + + [[0.0, 1.0], + [1.0, 0.0]], + + [[0.0, 1.0], + [1.0, 0.0]]]] + + Args: + input(Variable): The input tensor variable(default float). It should + be a 4-D tensor with shape [BatchSize, A, B, C]. + axis(int): Indicating the dimension to be selected. It can only be + 1, 2 or 3. + indexes(list): Indicating the indexes of the selected dimension. + + Returns: + Variable: A tensor variable with the same shape and same type + as the input. + + Examples: + .. 
code-block:: python + data = fluid.layers.data( + name='data', shape=[2, 3, 2, 2], dtype='float32') + x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[0]) + """ + helper = LayerHelper('similarity_focus', **locals()) + # check attrs + if isinstance(axis, int) is False: + raise TypeError("axis must be int type.") + if isinstance(indexes, list) is False: + raise TypeError("indexes must be list type.") + if axis != 1 and axis != 2 and axis != 3: + raise ValueError("axis must be 1, 2 or 3.") + if len(indexes) == 0: + raise ValueError("indexes can not be empty.") + + if name is None: + out = helper.create_variable_for_type_inference(dtype=input.dtype) + else: + out = helper.create_variable( + name=name, dtype=input.dtype, persistable=False) + helper.append_op( + type='similarity_focus', + inputs={'X': input}, + outputs={'Out': out}, + attrs={"axis": axis, + "indexes": indexes}) + return out + + def hash(input, hash_size, num_hash=1, name=None): """ Hash the input to an integer whose value is less than the given hash size. @@ -8046,3 +8290,72 @@ def add_position_encoding(input, alpha, beta, name=None): attrs={"alpha": alpha, "beta": beta}) return out + + +def bilinear_tensor_product(x, + y, + size, + act=None, + name=None, + param_attr=None, + bias_attr=None): + """ + **Add Bilinear Tensor Product Layer** + + This layer performs bilinear tensor product on two inputs. + For example: + + .. math:: + out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 + + In this formula: + - :math:`x`: the first input contains M elements, shape is [batch_size, M]. + - :math:`y`: the second input contains N elements, shape is [batch_size, N]. + - :math:`W_{i}`: the i-th learned weight, shape is [M, N] + - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size]. + - :math:`y^\mathrm{T}`: the transpose of :math:`y`. 
+ + Args: + x (Variable): 2-D input tensor with shape [batch_size, M] + y (Variable): 2-D input tensor with shape [batch_size, N] + size (int): The dimension of this layer. + act (str, default None): Activation to be applied to the output of this layer. + name (str, default None): The name of this layer. + param_attr (ParamAttr, default None): The parameter attribute for the learnable w. + parameters/weights of this layer. + bias_attr (ParamAttr, default None): The parameter attribute for the bias + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + + Returns: + Variable: A 2-D Tensor of shape [batch_size, size]. + + Examples: + .. code-block:: python + + tensor = bilinear_tensor_product(x=layer1, y=layer2, size=1000) + """ + helper = LayerHelper('bilinear_tensor_product', **locals()) + dtype = helper.input_dtype('x') + + param_shape = [size, x.shape[1], y.shape[1]] + + w = helper.create_parameter( + attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False) + + if name is None: + out = helper.create_variable_for_type_inference(dtype=dtype) + else: + out = helper.create_variable(name=name, dtype=dtype, persistable=False) + + inputs = {"X": x, "Y": y, "Weight": w} + if helper.bias_attr: + bias_size = [1, size] + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + inputs["Bias"] = bias + helper.append_op( + type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out}) + + # add activation + return helper.append_activation(out) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 09a7cb8dc9339afa666f8cf09e92a27ffba8a9b3..57e5d197b618615b32a7f446df0a81e18c25b097 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -24,10 +24,10 @@ from .layer_function_generator import templatedoc import numpy __all__ = [ - 
'create_tensor', 'create_parameter', 'create_global_var', 'cast', 'concat', - 'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant', - 'argmin', 'argmax', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', - 'has_nan', 'isfinite' + 'create_tensor', 'create_parameter', 'create_global_var', 'cast', + 'tensor_array_to_tensor', 'concat', 'sums', 'assign', + 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', + 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite' ] @@ -193,6 +193,60 @@ def concat(input, axis=0, name=None): return out +def tensor_array_to_tensor(input, axis=1, name=None): + """ + This function concatenates the input LodTensorArray along the axis mentioned + and returns that as the output. + + A simple example as below: + + .. code-block:: text + + Given: + + input.data = {[[0.6, 0.1, 0.3], + [0.5, 0.3, 0.2]], + [[1.3], + [1.8]], + [[2.3, 2.1], + [2.5, 2.4]]} + + axis = 1 + + Then: + + output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1], + [0.5, 0.3, 0.2, 1.8, 2.5, 2.4]] + + output_index.data = [3, 1, 2] + + Args: + input(list): Input LodTensorArray + axis(int): Integer axis along which the tensors will be concatenated + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: Output variable of the concatenation + Variable: The input LodTensorArray items' dims along the axis + + Examples: + .. 
code-block:: python + + output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array) + """ + helper = LayerHelper('tensor_array_concat', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + out_index = helper.create_variable_for_type_inference(dtype="int32") + helper.append_op( + type='tensor_array_concat', + inputs={'X': input}, + outputs={'Out': [out], + 'OutIndex': [out_index]}, + attrs={'axis': axis}) + return out, out_index + + def sums(input, out=None): """ This function performs the sum operation on the input and returns the diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7e2364a5a872cdd8cf590438cc081ab070db767d..da92826d410505c9a80820f655162dd22e6b5966 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -13,21 +13,23 @@ # limitations under the License. from __future__ import print_function -import re -import sys + from collections import defaultdict +from contextlib import contextmanager + from paddle.fluid.framework import Program, Variable, name_scope, default_main_program +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table + from . import framework from . import layers +from . import unique_name from .backward import append_backward +from .clip import append_gradient_clip_ops, error_clip_callback from .framework import program_guard -from . 
import unique_name from .initializer import Constant from .layer_helper import LayerHelper -from .regularizer import append_regularization_ops -from .clip import append_gradient_clip_ops, error_clip_callback -from contextlib import contextmanager from .layers import ops +from .regularizer import append_regularization_ops __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', @@ -85,7 +87,7 @@ class Optimizer(object): name=unique_name.generate("learning_rate"), shape=[1], value=float(self._learning_rate), - dtype='float32' if self._dtype == None else self._dtype, + dtype='float32' if self._dtype is None else self._dtype, persistable=True) def _global_learning_rate(self, program=None): @@ -245,6 +247,50 @@ class Optimizer(object): end = len(global_block.ops) return global_block._slice_ops(start, end) + def _process_distribute_lookuptable(self, param_grads, loss, + startup_program): + """ + Because distribute lookup table only support SGD optimizer for now, not support + other optimizer and regularization, so we should find the table parameter out, + and avoid to add regularization and other op for it, and add sgd optimize op + for it independently. + :param param_grads(list((Var, Var))): list of (param, grad) pair. + :param loss: the loss variable. 
+ :param startup_program: the startup program + """ + program = loss.block.program + table_name = find_distributed_lookup_table(program) + table_param = None + table_grad = None + new_param_grads = [] + for p, g in param_grads: + if p.name == table_name: + if table_param is not None: + raise RuntimeError( + "multi dist table var found, only support one now!") + table_param = p + table_grad = g + else: + new_param_grads.append((p, g)) + sgd_op = None + if table_param is not None: + with program_guard(program, startup_program): + param_and_grad = [table_param, table_grad] + with table_param.block.program._optimized_guard(param_and_grad), \ + framework.name_scope("optimizer"): + self._create_global_learning_rate() + # create the optimize op + sgd_op = loss.block.append_op( + type='sgd', + inputs={ + "Param": table_param, + "Grad": table_grad, + "LearningRate": + self._create_param_lr(param_and_grad) + }, + outputs={"ParamOut": param_and_grad[0]}) + return new_param_grads, (table_param, table_grad), sgd_op + def minimize(self, loss, startup_program=None, @@ -260,6 +306,9 @@ class Optimizer(object): params_grads = sorted(params_grads, key=lambda x: x[0].name) + params_grads, table_param_and_grad, table_optimize_op = \ + self._process_distribute_lookuptable(params_grads, loss, startup_program) + params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any @@ -268,6 +317,9 @@ class Optimizer(object): optimize_ops = self._create_optimization_pass(params_grads, loss, startup_program) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) return optimize_ops, params_grads diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index f63387a90617dc4e9b7c9ee7caa2d01595237a03..42ab9b231153f7ede7b8f8dd4e754f8cc92f65fe 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ 
b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -38,7 +38,7 @@ depth = 8 mix_hidden_lr = 1e-3 IS_SPARSE = True -PASS_NUM = 10 +PASS_NUM = 1 BATCH_SIZE = 10 embedding_name = 'emb' diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py deleted file mode 100644 index bed847c3c168c906a89c32631b2a8f0ba2e6e7be..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import unittest -import numpy as np -from op_test import OpTest -import paddle.fluid.core as core - - -def bilinear_interp_np(input, out_h, out_w, out_size): - if out_size is not None: - out_h = out_size[0] - out_w = out_size[1] - batch_size, channel, in_h, in_w = input.shape - if out_h > 1: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 0.0 - if out_w > 1: - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 0.0 - - out = np.zeros((batch_size, channel, out_h, out_w)) - for i in range(out_h): - h = int(ratio_h * i) - hid = 1 if h < in_h - 1 else 0 - h1lambda = ratio_h * i - h - h2lambda = 1.0 - h1lambda - for j in range(out_w): - w = int(ratio_w * j) - wid = 1 if w < in_w - 1 else 0 - w1lambda = ratio_w * j - w - w2lambda = 1.0 - w1lambda - - out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + - w1lambda*input[:, :, h, w+wid]) + \ - h1lambda*(w2lambda*input[:, :, h+hid, w] + - w1lambda*input[:, :, h+hid, w+wid]) - return out.astype(input.dtype) - - -class TestBilinearInterpOp(OpTest): - def setUp(self): - self.out_size = None - self.init_test_case() - self.op_type = "bilinear_interp" - input_np = np.random.random(self.input_shape).astype("float32") - output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, - self.out_size) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - self.attrs = {'out_h': self.out_h, 'out_w': self.out_w} - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True) - - def init_test_case(self): - self.input_shape = [2, 3, 4, 4] - self.out_h = 2 - self.out_w = 2 - self.out_size = np.array([3, 3]).astype("int32") - - -class TestCase1(TestBilinearInterpOp): - def init_test_case(self): - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - - -class TestCase2(TestBilinearInterpOp): - def 
init_test_case(self): - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - - -class TestCase3(TestBilinearInterpOp): - def init_test_case(self): - self.input_shape = [1, 1, 128, 64] - self.out_h = 64 - self.out_w = 128 - - -class TestCase4(TestBilinearInterpOp): - def init_test_case(self): - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.out_size = np.array([2, 2]).astype("int32") - - -class TestCase5(TestBilinearInterpOp): - def init_test_case(self): - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.out_size = np.array([11, 11]).astype("int32") - - -class TestCase6(TestBilinearInterpOp): - def init_test_case(self): - self.input_shape = [1, 1, 128, 64] - self.out_h = 64 - self.out_w = 128 - self.out_size = np.array([65, 129]).astype("int32") - - -class TestBilinearInterpOpUint8(OpTest): - def setUp(self): - self.out_size = None - self.init_test_case() - self.op_type = "bilinear_interp" - input_np = np.random.randint( - low=0, high=256, size=self.input_shape).astype("uint8") - output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, - self.out_size) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - self.attrs = {'out_h': self.out_h, 'out_w': self.out_w} - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output_with_place(place=core.CPUPlace(), atol=1) - - def init_test_case(self): - self.input_shape = [1, 3, 9, 6] - self.out_h = 10 - self.out_w = 9 - - -class TestCase1Uint8(TestBilinearInterpOpUint8): - def init_test_case(self): - self.input_shape = [2, 3, 128, 64] - self.out_h = 120 - self.out_w = 50 - - -class TestCase2Uint8(TestBilinearInterpOpUint8): - def init_test_case(self): - self.input_shape = [4, 1, 7, 8] - self.out_h = 5 - self.out_w = 13 - self.out_size = np.array([6, 15]).astype("int32") - - -if __name__ == "__main__": - unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 6ab13b5106064112cc7886eeec899802a5be60da..ebbbf3ab8b00ff49d55ea5d472a2f7c4eae0da52 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -67,6 +67,7 @@ class TestConv2dOp(OpTest): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False + self.exhaustive_search = False self.use_cuda = False self.use_mkldnn = False self.data_format = "AnyLayout" @@ -98,7 +99,8 @@ class TestConv2dOp(OpTest): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'use_mkldnn': self.use_mkldnn, - 'data_format': self.data_format + 'data_format': self.data_format, + 'exhaustive_search': self.exhaustive_search } self.outputs = {'Output': output} @@ -361,6 +363,12 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp): self.op_type = "depthwise_conv2d" +class TestCUDNNExhaustiveSearch(TestConv2dOp): + def init_kernel_type(self): + self.use_cudnn = True + self.exhaustive_search = True + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. 
# class TestCUDNNWithDilation(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index ddaf99fe061205f0f2e4c592c9e28e27e657c16a..69c5ab7a4a4cbd552d27dcb07052d46752eeb54a 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -335,6 +335,12 @@ class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1): self.check_output_with_place(place, atol=2e-2) +class TestCUDNNExhaustiveSearch(TestCUDNN): + def init_kernel_type(self): + self.use_cudnn = True + self.exhaustive_search = True + + # FIXME(typhoonzero): find a way to determine if # using cudnn > 6 in python # class TestWithDilationCUDNN(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 3a5b6b5cb8ee4f83c26a96e868e7c75933d28c15..d132dd3c48f55c07725515e40faeb5076398adeb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -567,7 +567,6 @@ class TestDistLookupTable(TestDistLookupTableBase): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', - 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'uniform_random', 'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'fake_init' @@ -639,7 +638,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): # 5 save table self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) - trainer, _ = self.get_trainer(config) + trainer, trainer_startup = self.get_trainer(config) self.assertEqual(len(trainer.blocks), 1) ops = [ 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', @@ -653,6 
+652,16 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): 'recv', 'concat' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + startup_ops = [ + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'uniform_random', + 'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat', + 'fake_init' + ] + self.assertEqual([op.type for op in trainer_startup.blocks[0].ops], + startup_ops) class TestDistLookupTableSliceSize(TestDistLookupTableBase): diff --git a/python/paddle/fluid/tests/unittests/test_interpolate_op.py b/python/paddle/fluid/tests/unittests/test_interpolate_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9748d094cda6ee9dc649d95d1ca7f1c4b55d1031 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_interpolate_op.py @@ -0,0 +1,335 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core + + +def nearest_neighbor_interp_np(X, + out_h, + out_w, + out_size=None, + actual_shape=None): + """nearest neighbor interpolation implement in shape [N, C, H, W]""" + if out_size is not None: + out_h = out_size[0] + out_w = out_size[1] + if actual_shape is not None: + out_h = actual_shape[0] + out_w = actual_shape[1] + n, c, in_h, in_w = X.shape + + ratio_h = ratio_w = 0.0 + if out_h > 1: + ratio_h = (in_h - 1.0) / (out_h - 1.0) + if out_w > 1: + ratio_w = (in_w - 1.0) / (out_w - 1.0) + + out = np.zeros((n, c, out_h, out_w)) + for i in range(out_h): + in_i = int(ratio_h * i + 0.5) + for j in range(out_w): + in_j = int(ratio_w * j + 0.5) + out[:, :, i, j] = X[:, :, in_i, in_j] + + return out.astype(X.dtype) + + +def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None): + """bilinear interpolation implement in shape [N, C, H, W]""" + if out_size is not None: + out_h = out_size[0] + out_w = out_size[1] + if actual_shape is not None: + out_h = actual_shape[0] + out_w = actual_shape[1] + batch_size, channel, in_h, in_w = input.shape + if out_h > 1: + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 0.0 + if out_w > 1: + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 0.0 + + out = np.zeros((batch_size, channel, out_h, out_w)) + for i in range(out_h): + h = int(ratio_h * i) + hid = 1 if h < in_h - 1 else 0 + h1lambda = ratio_h * i - h + h2lambda = 1.0 - h1lambda + for j in range(out_w): + w = int(ratio_w * j) + wid = 1 if w < in_w - 1 else 0 + w1lambda = ratio_w * j - w + w2lambda = 1.0 - w1lambda + + out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + + w1lambda*input[:, :, h, w+wid]) + \ + h1lambda*(w2lambda*input[:, :, h+hid, w] + + w1lambda*input[:, :, h+hid, w+wid]) + return out.astype(input.dtype) + + +INTERPOLATE_FUNCS = { + 'bilinear': bilinear_interp_np, + 'nearest': 
nearest_neighbor_interp_np, +} + + +class TestInterpolateOp(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "interpolate" + input_np = np.random.random(self.input_shape).astype("float32") + + output_np = INTERPOLATE_FUNCS[self.interp_method]( + input_np, self.out_h, self.out_w, self.out_size, self.actual_shape) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method + } + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 4, 4] + self.out_h = 2 + self.out_w = 2 + self.out_size = np.array([3, 3]).astype("int32") + + +class TestBilinearInterpCase1(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + + +class TestBilinearInterpCase2(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + + +class TestBilinearInterpCase3(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 128, 64] + self.out_h = 64 + self.out_w = 128 + + +class TestBilinearInterpCase4(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.out_size = np.array([2, 2]).astype("int32") + + +class TestBilinearInterpCase5(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + 
self.out_w = 12 + self.out_size = np.array([11, 11]).astype("int32") + + +class TestBilinearInterpCase6(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 128, 64] + self.out_h = 64 + self.out_w = 128 + self.out_size = np.array([65, 129]).astype("int32") + + +class TestBilinearInterpActualShape(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.out_size = np.array([66, 40]).astype("int32") + + +class TestBilinearInterpBigScale(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 4, 64, 32] + self.out_h = 100 + self.out_w = 50 + self.out_size = np.array([101, 51]).astype('int32') + + +class TestInterpolateOpUint8(OpTest): + def setUp(self): + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "interpolate" + input_np = np.random.randint( + low=0, high=256, size=self.input_shape).astype("uint8") + output_np = INTERPOLATE_FUNCS[self.interp_method]( + input_np, self.out_h, self.out_w, self.out_size, self.actual_shape) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method + } + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(place=core.CPUPlace(), atol=1) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 3, 9, 6] + self.out_h = 10 + self.out_w = 9 + + +class TestBilinearInterpCase1Uint8(TestInterpolateOpUint8): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 128, 64] + self.out_h = 120 + self.out_w = 50 + + +class TestBilinearInterpCase2Uint8(TestInterpolateOpUint8): + def init_test_case(self): + self.interp_method = 'bilinear' + 
self.input_shape = [4, 1, 7, 8] + self.out_h = 5 + self.out_w = 13 + self.out_size = np.array([6, 15]).astype("int32") + + +class TestNearestNeighborInterpCase1(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + + +class TestNearestNeighborInterpCase2(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + + +class TestNearestNeighborInterpCase3(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 1, 128, 64] + self.out_h = 64 + self.out_w = 128 + + +class TestNearestNeighborInterpCase4(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.out_size = np.array([2, 2]).astype("int32") + + +class TestNearestNeighborInterpCase5(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.out_size = np.array([11, 11]).astype("int32") + + +class TestNearestNeighborInterpCase6(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 1, 128, 64] + self.out_h = 64 + self.out_w = 128 + self.out_size = np.array([65, 129]).astype("int32") + + +class TestNearestNeighborInterpActualShape(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.out_size = np.array([66, 40]).astype("int32") + + +class TestNearestNeighborInterpBigScale(TestInterpolateOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 4, 64, 32] + self.out_h = 100 + self.out_w = 50 + self.out_size = np.array([101, 51]).astype('int32') + + +class 
TestNearestNeighborInterpCase1Uint8(TestInterpolateOpUint8): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 128, 64] + self.out_h = 120 + self.out_w = 50 + + +class TestNearestNeighborInterpCase2Uint8(TestInterpolateOpUint8): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 5 + self.out_w = 13 + self.out_size = np.array([6, 15]).astype("int32") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 49ba41e6fc908e9713414120bbeb45ca715042c3..f48d9c84f9c10b0eff8e41a510d168543c9795fa 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -496,6 +496,16 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_resize_nearest(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 9, 6], dtype="float32") + output = layers.resize_nearest(x, out_shape=[12, 12]) + self.assertIsNotNone(output) + output = layers.resize_nearest(x, scale=3) + self.assertIsNotNone(output) + print(str(program)) + def test_polygon_box_transform(self): program = Program() with program_guard(program): @@ -901,6 +911,16 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(data_1) print(str(program)) + def test_bilinear_tensor_product_layer(self): + program = Program() + with program_guard(program): + data = layers.data(name='data', shape=[4], dtype="float32") + + theta = layers.data(name="theta", shape=[5], dtype="float32") + out = layers.bilinear_tensor_product(data, theta, 6) + + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index 
c93740669f40aee3a6c143d153cfd0f5bb72dbd9..18d95c94ad36316b7149eb5412260b40a57ac002 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -21,8 +21,8 @@ import six class TestBase(unittest.TestCase): def main(self, network_func, - iter=100, - iter_per_pe=100, + iter=10, + iter_per_pe=10, use_gpu=True, use_experimental_executor=False): if use_gpu and not fluid.core.is_compiled_with_cuda(): @@ -45,7 +45,7 @@ class TestBase(unittest.TestCase): exe_strategy._dry_run = True exe_strategy.use_experimental_executor = use_experimental_executor pe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_gpu, loss_name=loss.name, main_program=main_prog, exec_strategy=exe_strategy) diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py new file mode 100755 index 0000000000000000000000000000000000000000..b3833f05f1aa3aac7b5bcc5b6fdc138870cc8844 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py @@ -0,0 +1,217 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest + + +class TestSimilarityFocusOp(OpTest): + def setUp(self): + self.op_type = "similarity_focus" + batch_size = 2 + x_dim, y_dim, z_dim = 3, 2, 2 + self.inputs = { + 'X': np.array([[[[0.8, 0.1], [0.4, 0.5]], [[0.9, 0.7], [0.9, 0.9]], + [[0.8, 0.9], [0.1, 0.2]]], + [[[0.2, 0.5], [0.3, 0.4]], [[0.9, 0.7], [0.8, 0.4]], + [[0.0, 0.2], [0.4, 0.7]]]]), + } + self.attrs = { + 'axis': 1, + 'indexes': [0], + } + + output = None + for batch in range(batch_size): + res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1) + for index in self.attrs['indexes']: + channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy( + ) + tag1 = [0 for i in range(y_dim)] + tag2 = [0 for i in range(z_dim)] + cnt = 0 + for i in range(channel.size): + index = channel.argmax() + idx1 = index // z_dim + idx2 = index % z_dim + if tag1[idx1] + tag2[idx2] == 0: + tag1[idx1] = 1 + tag2[idx2] = 1 + res[index] = 1 + cnt += 1 + if cnt == min(y_dim, z_dim): + break + channel[index] = -1 + res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0) + res = res.reshape(1, x_dim, y_dim, z_dim) + if output is not None: + output = np.concatenate((output, res), axis=0) + else: + output = res + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + +class TestSimilarityFocusOp_axis1(OpTest): + def setUp(self): + self.op_type = "similarity_focus" + batch_size = 3 + x_dim, y_dim, z_dim = 4, 5, 6 + self.inputs = { + 'X': np.random.random( + (batch_size, x_dim, y_dim, z_dim)).astype("float32"), + } + self.attrs = { + 'axis': 1, + 'indexes': [0, 3], + } + + output = None + for batch in range(batch_size): + res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1) + for index in self.attrs['indexes']: + channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy( + ) + tag1 = [0 for i in range(y_dim)] + tag2 = [0 for i 
in range(z_dim)] + cnt = 0 + for i in range(channel.size): + index = channel.argmax() + idx1 = index // z_dim + idx2 = index % z_dim + if tag1[idx1] + tag2[idx2] == 0: + tag1[idx1] = 1 + tag2[idx2] = 1 + res[index] = 1 + cnt += 1 + if cnt == min(y_dim, z_dim): + break + channel[index] = -1 + res = res.reshape(1, y_dim, z_dim) + res = res.repeat([x_dim], axis=0) + res = res.reshape(1, x_dim, y_dim, z_dim) + if output is not None: + output = np.concatenate((output, res), axis=0) + else: + output = res + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + +class TestSimilarityFocusOp_axis2(OpTest): + def setUp(self): + self.op_type = "similarity_focus" + batch_size = 6 + x_dim, y_dim, z_dim = 7, 8, 9 + self.inputs = { + 'X': np.random.random( + (batch_size, x_dim, y_dim, z_dim)).astype("float32"), + } + self.attrs = { + 'axis': 2, + 'indexes': [0, 3, 5], + } + + output = None + for batch in range(batch_size): + res = np.zeros((x_dim, 1, z_dim)).astype("float32").reshape(-1) + for index in self.attrs['indexes']: + channel = self.inputs['X'][batch, :, index, :].reshape(-1).copy( + ) + tag1 = [0 for i in range(x_dim)] + tag2 = [0 for i in range(z_dim)] + cnt = 0 + for i in range(channel.size): + index = channel.argmax() + idx1 = index // z_dim + idx2 = index % z_dim + if tag1[idx1] + tag2[idx2] == 0: + tag1[idx1] = 1 + tag2[idx2] = 1 + res[index] = 1 + cnt += 1 + if cnt == min(x_dim, z_dim): + break + channel[index] = -1 + res = res.reshape(x_dim, 1, z_dim) + res = res.repeat([y_dim], axis=1) + res = res.reshape(1, x_dim, y_dim, z_dim) + if output is not None: + output = np.concatenate((output, res), axis=0) + else: + output = res + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + +class TestSimilarityFocusOp_axis3(OpTest): + def setUp(self): + self.op_type = "similarity_focus" + batch_size = 64 + x_dim, y_dim, z_dim = 48, 48, 13 + self.inputs = { + 'X': np.random.random( + (batch_size, x_dim, 
y_dim, z_dim)).astype("float32"), + } + self.attrs = { + 'axis': 3, + 'indexes': [0, 2, 7, 9], + } + + output = None + for batch in range(batch_size): + res = np.zeros((x_dim, y_dim, 1)).astype("float32").reshape(-1) + for index in self.attrs['indexes']: + channel = self.inputs['X'][batch, :, :, index].reshape(-1).copy( + ) + tag1 = [0 for i in range(x_dim)] + tag2 = [0 for i in range(y_dim)] + cnt = 0 + for i in range(channel.size): + index = channel.argmax() + idx1 = index // y_dim + idx2 = index % y_dim + if tag1[idx1] + tag2[idx2] == 0: + tag1[idx1] = 1 + tag2[idx2] = 1 + res[index] = 1 + cnt += 1 + if cnt == min(x_dim, y_dim): + break + channel[index] = -1 + res = res.reshape(x_dim, y_dim, 1) + res = res.repeat([z_dim], axis=2) + res = res.reshape(1, x_dim, y_dim, z_dim) + if output is not None: + output = np.concatenate((output, res), axis=0) + else: + output = res + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..78b95de7e07b1d1fcdeeae63498e740c2b474c6d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py @@ -0,0 +1,142 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from paddle.fluid.executor import Executor
+
+
+class TestLoDTensorArrayConcat(unittest.TestCase):
+    """Forward/backward test of tensor_array_to_tensor with axis=0."""
+    def setUp(self):
+        self.op_type = "tensor_array_to_tensor"
+        self.attrs = {"axis": 0}
+        self.outputs = ["Out"]
+
+    def test_get_set(self):
+        """Check Out, OutIndex and the input gradient on a 10-tensor array."""
+        scope = core.Scope()
+        program = fluid.Program()
+        block = program.global_block()
+
+        input_arr = block.create_var(
+            name="tmp_lod_tensor_array",
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        input_arr.persistable = True
+        input_arr_var = scope.var('tmp_lod_tensor_array')
+        input_tensor_array = input_arr_var.get_lod_tensor_array()
+        self.assertEqual(0, len(input_tensor_array))
+
+        cpu = core.CPUPlace()
+        for i in range(10):
+            t = core.LoDTensor()
+            if i == 0:
+                t.set(numpy.array([[i], [i]], dtype='float32'), cpu)
+            else:
+                t.set(numpy.array([[i]], dtype='float32'), cpu)
+            input_tensor_array.append(t)
+
+        self.assertEqual(10, len(input_tensor_array))
+
+        random_grad = numpy.random.random_sample([11]).astype(numpy.float32)
+
+        y_out = block.create_var(name="Out")
+        y_out.persistable = True
+        y_out_index = block.create_var(name="OutIndex")
+        y_out_index.persistable = True
+
+        y_grad_arr = block.create_var(
+            name='Out@GRAD', dtype='float32', shape=[11])
+        y_grad_arr.persistable = True
+        y_grad = scope.var('Out@GRAD')
+        y_grad_tensor = y_grad.get_tensor()
+        y_grad_tensor.set(random_grad, cpu)
+
+        op = block.append_op(
+            type=self.op_type,
+            inputs={"X": input_arr},
+            outputs={"Out": y_out, "OutIndex": y_out_index},
+            attrs=self.attrs)
+
+        out_grad = block.create_var(
+            name="tmp_lod_tensor_array@GRAD",
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        out_grad.persistable = True
+
+        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
+                                                                  set(), [])
+        grad_op_desc = grad_op_desc_list[0]
+        new_op_desc = block.desc.append_op()
+        new_op_desc.copy_from(grad_op_desc)
+        for var_name in grad_op_desc.output_arg_names():
+            block.desc.var(var_name.encode("ascii"))
+
+        grad_op_desc.infer_var_type(block.desc)
+        grad_op_desc.infer_shape(block.desc)
+        for arg in grad_op_desc.output_arg_names():
+            grad_var = block.desc.find_var(arg.encode("ascii"))
+            grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+        fetch_list = []
+        fetch_list.append(block.var('Out'))
+        fetch_list.append(block.var('OutIndex'))
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        out = exe.run(program, fetch_list=fetch_list, scope=scope)
+
+        # Forward: Out stacks every input row; OutIndex is each row count.
+        tensor_res = numpy.array(out[0])
+        tensor_res_out_idx = numpy.array(out[1])
+        tensor_gt = numpy.array(
+            [0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32')
+
+        self.assertEqual(len(tensor_res), len(tensor_gt))
+        self.assertEqual(len(tensor_res_out_idx), 10)
+
+        for i in range(len(tensor_res)):
+            self.assertEqual(tensor_res[i], tensor_gt[i])
+
+        for i in range(len(tensor_res_out_idx)):
+            if i == 0:
+                self.assertEqual(tensor_res_out_idx[i], 2)
+            else:
+                self.assertEqual(tensor_res_out_idx[i], 1)
+
+        # Backward: every array slot must hold its own slice of Out@GRAD.
+        grad_tensor = scope.var('tmp_lod_tensor_array@GRAD')
+        grad_tensor_array = grad_tensor.get_lod_tensor_array()
+
+        self.assertEqual(10, len(grad_tensor_array))
+
+        for i in range(len(grad_tensor_array)):
+            if i == 0:
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i])[0],
+                    numpy.array(random_grad[i]))
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i])[1],
+                    numpy.array(random_grad[i + 1]))
+            else:  # slot 0 took two rows, so slot i maps to row i + 1
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i]),
+                    numpy.array(random_grad[i + 1]))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 094eaeb59ce7ab73012f6e6a5fc24778933270c1..89bc24802751340b6d4657be8673d714f3d3dc2b 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -31,18 +31,17 @@ Steps to transpile pserver:
 """
 
 import math
-import sys
 import numpy as np
 import collections
-import six
 import logging
 
-from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
+from .ps_dispatcher import RoundRobin, PSDispatcher
 from .. import core, framework, unique_name
 from ..framework import Program, default_main_program, \
                         default_startup_program, Block, \
                         Parameter, grad_var_name
 from .details import *
+from ..distribute_lookup_table import find_distributed_lookup_table
 from functools import reduce
 
 LOOKUP_TABLE_TYPE = "lookup_table"
@@ -292,7 +291,8 @@ class DistributeTranspiler(object):
         self.optimize_ops, self.params_grads = self._get_optimize_pass()
 
         ps_dispatcher = self.config.split_method(self.pserver_endpoints)
-        self.has_distributed_lookup_table = self._has_distributed_lookup_table()
+        self.table_name = find_distributed_lookup_table(self.origin_program)
+        self.has_distributed_lookup_table = self.table_name is not None
         self.param_name_to_grad_name = dict()
         self.grad_name_to_param_name = dict()
         for param_var, grad_var in self.params_grads:
@@ -966,28 +966,6 @@ to transpile() call.")
 
     # ====================== private transpiler functions =====================
 
-    def _has_distributed_lookup_table(self):
-        # process lookup_table_op
-        # 1. check all lookup_table_op is distributed
-        # 2. check all lookup_table_op share the same table.
-        distributed_lookup_table_ops = []
-        # support only one distributed_lookup_table now
-        self.table_name = None
-        for op in self.origin_program.global_block().ops:
-            if op.type == LOOKUP_TABLE_TYPE:
-                if op.attr('is_distributed') is True:
-                    if self.table_name is None:
-                        self.table_name = op.input("W")[0]
-                    if self.table_name != op.input("W")[0]:
-                        raise RuntimeError("all distributed lookup_table_ops"
-                                           " should have only one table")
-                    distributed_lookup_table_ops.append(op)
-                else:
-                    if self.table_name is not None:
-                        assert op.input("W")[0] != self.table_name
-
-        return len(distributed_lookup_table_ops) > 0
-
     def _update_dist_lookup_table_vars(self, param_list, grad_list,
                                        params_grads):
         # TODO(wuyi): put find a way to put dist lookup table stuff all together.
@@ -1341,7 +1319,6 @@ to transpile() call.")
         """
         create a new block to handle save checkpoint.
         """
-        import os
 
         pserver_program.global_block().create_var(
             name="kLookupTablePath",
diff --git a/python/setup.py.in b/python/setup.py.in
index b1ff9f3a5c3d877edb6bc6a12efce053a44b4c9c..c623057d5081a6fedcd90eb5f5d53531a5d62bb8 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -174,6 +174,18 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
             raise Exception("patch libmkldnn.so failed, command: %s" % command)
     package_data['paddle.libs']+=['libmkldnn.so.0']
     shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
+if '${WITH_NGRAPH}' == 'ON':
+    if '${CMAKE_BUILD_TYPE}' == 'Release':
+        # only change rpath in Release mode.
+        command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
+        if os.system(command) != 0:
+            raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
+    shutil.copy('${NGRAPH_SHARED_LIB}', libs_path)
+    shutil.copy('${NGRAPH_CPU_LIB}', libs_path)
+    shutil.copy('${NGRAPH_TBB_LIB}', libs_path)
+    package_data['paddle.libs']+=['${NGRAPH_SHARED_LIB_NAME}',
+                                  '${NGRAPH_CPU_LIB_NAME}',
+                                  '${NGRAPH_TBB_LIB_NAME}']
 # remove unused paddle/libs/__init__.py
 os.remove(libs_path+'/__init__.py')
 package_dir['paddle.libs']=libs_path