diff --git a/Dockerfile b/Dockerfile index fbec88c7966d6ea93495519843d6cda63f622661..7856d3bbc492af4cad2d6b9f49001c90eadbea43 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 MAINTAINER PaddlePaddle Authors ARG UBUNTU_MIRROR diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 6320b17520a687f88993b6f464d9115838b0f96b..e3b9d94215a858c5c9a34e1b7e97540f1876801d 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -62,29 +62,33 @@ endif() ## Then find the reference-cblas. www.netlib.org/blas/ - - set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH "Folder contains reference-cblas") -set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas -) - -set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ -) +if(NOT CMAKE_CROSSCOMPILING) + set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/include + /usr/include + /usr/include/cblas + ) + + set(REFERENCE_CBLAS_LIB_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/reference/ + /usr/lib/reference/ + ) +else() + # Disable the finding of reference cblas under host's system path + set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) + set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) +endif() find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) -if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) +if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) set(CBLAS_FOUND ON) set(CBLAS_PROVIDER REFERENCE) set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 0853b981813c5d60a12603471df7e0b216b0822f..aa249159470773241e0f6da2e8e086264634dd4a 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh) ELSE() - SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin) ENDIF() ExternalProject_Add( extern_grpc DEPENDS protobuf zlib GIT_REPOSITORY "https://github.com/grpc/grpc.git" - GIT_TAG "v1.8.x" + GIT_TAG "v1.11.x" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 71f54c425d4c38e271a8f1b78887d95a27252443..80282329c6ac65fbd1493a6838efca4bd9cadaad 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -11,19 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -IF(MOBILE_INFERENCE) +if(MOBILE_INFERENCE OR RPI) return() -ENDIF() +endif() include (ExternalProject) # NOTE: snappy is needed when linking with recordio -SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) -SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) -SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE) +set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) +set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) +set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) + +set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") ExternalProject_Add( extern_snappy @@ -51,8 +52,7 @@ ExternalProject_Add( ) add_library(snappy STATIC IMPORTED GLOBAL) -set_property(TARGET snappy PROPERTY IMPORTED_LOCATION - "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) include_directories(${SNAPPY_INCLUDE_DIR}) add_dependencies(snappy extern_snappy) diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake index 8f7a3bf8eeaef75c8840f4ea318b484d33249bb7..20a96430823d07a07d4bb4602e7fc0cfe55c3bf2 100644 --- a/cmake/external/snappystream.cmake +++ b/cmake/external/snappystream.cmake @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -IF(MOBILE_INFERENCE) +IF(MOBILE_INFERENCE OR RPI) return() ENDIF() @@ -21,9 +20,11 @@ include (ExternalProject) # NOTE: snappy is needed when linking with recordio -SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) -SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) -SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE) +set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) +set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) +set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE) + +set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") ExternalProject_Add( extern_snappystream @@ -51,8 +52,7 @@ ExternalProject_Add( ) add_library(snappystream STATIC IMPORTED GLOBAL) -set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION - "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") +set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES}) include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers. include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers. diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c4c9f77df8d57fe162616d2250bd4dfe5b7754e7..1d3e2ade6d393c6e4c37eea0dc1064cdb18808a5 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -195,14 +195,7 @@ function(cc_library TARGET_NAME) list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() - if("${cc_library_DEPS}" MATCHES "ARCHIVE_START") - # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). - # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. - target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) - list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END) - else() - target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) - endif() + target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) endif() @@ -243,11 +236,7 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) - target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) - if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") - list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) - endif() + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 0323cd9698cba916d2aa04403be97c0a6a463830..cc758019827b9a5416a801e4da43d754d4492a73 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -1,7 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library function(find_fluid_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) string(FIND "${__target_path}" "fluid" pos) if(pos GREATER 1) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) @@ -77,6 +92,23 @@ elseif (WITH_MKLML) ) endif() +if(NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) +endif() + # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid") diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md index 5e147f8263e685a4665b5793f7127178cbc3cfdd..4b7696cc1bbf57ace72c4d31ffc2bfe6c1071939 100644 --- a/doc/fluid/design/motivation/fluid.md +++ b/doc/fluid/design/motivation/fluid.md @@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. -We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid. +We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid. ## Turing Completeness diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index c44f8a8a8ecc1ba1f886fc41aec863b4ca3458a6..8b1ca5e16548334ed0c9a6d31b88e0805304579e 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY) endif() add_subdirectory(testing) -if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS) +if(NOT MOBILE_INFERENCE AND NOT RPI) add_subdirectory(fluid) endif() diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index d725763b01d5953985f8e090605f68a8419b5498..d274d96c29bdbf5973d568d783369c3975bdc436 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(pybind) -add_subdirectory(inference) add_subdirectory(string) add_subdirectory(recordio) +# NOTE: please add subdirectory inference at last. +add_subdirectory(inference) diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 873969b2a884f6d9e133fe87bf72725c36ce8b98..eef19c4f09c60b9df18f154c85c421f5bff9413f 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -92,7 +92,7 @@ class BlockDesc { /* * Remove Op and its input/output variables. - * Note that for either input or ouput variable, if it is also an input or + * Note that for either input or output variable, if it is also an input or * output variable of other ops, we should remain it. */ void RemoveOp(size_t s, size_t e); diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7a1b40c0b60a788b1f0a70e688f8fcbe427ad076..e3f8bbb72f2a1b75b6041d41496cef0efc81874f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" +#include + namespace paddle { namespace framework { namespace details { @@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() { } } - op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get(), place_); + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); } std::string ComputationOpHandle::Name() const { return op_->Type(); } diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 9180903b864d03e59f55f41410b2240fa4199496..e3e7c55d153aec8ce9c25c962821b266eaa84fe4 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/framework/details/fetch_op_handle.h" +#include +#include + namespace paddle { namespace framework { namespace details { @@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() { for (size_t i = 0; i < scopes.size(); ++i) { auto &scope = scopes[i]; - auto &t = scope->FindVar(var_name)->Get(); + auto &t = scope->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(var_name) + ->Get(); if (platform::is_gpu_place(var->place_)) { #ifdef PADDLE_WITH_CUDA TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]); diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index d7a541ac4bb83625060db337446d03a1afda3ed0..fbdb54ba8d940c8dedd44a42a85825af5d2ec664 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -24,6 +24,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; + class OpHandleBase { private: DISABLE_COPY_AND_ASSIGN(OpHandleBase); diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 3b818b1a45b56351e34f9e52ec22b6d02a0c1591..a8833b7388ab907020a260d356f1484ffd227658 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -15,13 +15,15 @@ #pragma once #include +#include +#include + #include "paddle/fluid/framework/details/ssa_graph.h" #include "paddle/fluid/framework/feed_fetch_type.h" namespace paddle { namespace framework { namespace details { - class SSAGraphExecutor { DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 62af4c1d79ded5eaa30e4e6d43cc0d7327ae9689..1ce69ab02b09fe7ec17f479bcef97c931e853dc4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ready_ops.clear(); }; - // Create local scopes. - for (auto &scope : local_scopes_) { - auto &local_scope = scope->NewScope(); - *scope->Var("@TMP_SCOPE@")->GetMutable() = &local_scope; - } - // Step 3. Execution while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) { // 1. Run All Ready ops @@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( PADDLE_ENFORCE(ready_ops.empty()); PADDLE_ENFORCE(delayed_ops.empty()); PADDLE_ENFORCE(blocked_by_delayed_ops.empty()); - ++computation_count_; - - auto sync_computation = [&] { - computation_count_ = 0; - // Wait All computational streams - for (auto p : this->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - for (auto &scope : local_scopes_) { - scope->DropKids(); - } - }; // Wait FetchOps. if (!fetch_ops.empty()) { fetch_ops.clear(); - sync_computation(); - } - - if (computation_count_ == max_async_computation) { - sync_computation(); - } - - // NOTE: the temp scope can be dropped lazily if needed. - // Drop tmp scopes; - for (auto &scope : local_scopes_) { - auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable(); - kid = nullptr; } return fetch_data; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 79cfc26b461a39811a9a125e5aeac3492d967386..bb5e837b135c35b5aea403496b45aab1ccc288ff 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unique_ptr exception_; std::atomic running_ops_; bool allow_op_delay_; - - size_t computation_count_{0}; - size_t max_async_computation{100}; }; } // namespace details diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c2ca1bbc78f3ebc6066df6b666720af0d1fbbf59..513e720fd099bcd898a6c73afd1a3a16f6f53aab 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (tensor.type().hash_code() != typeid(float).hash_code() && - tensor.type().hash_code() != typeid(double).hash_code()) { + if (tensor.type().hash_code() != typeid(float).hash_code() && // NOLINT + tensor.type().hash_code() != typeid(double).hash_code()) { // NOLINT return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, // Return true if the block has feed operators and holder of matching info. static bool has_feed_operators( const BlockDesc& block, - std::map& feed_targets, + const std::map& feed_targets, const std::string& feed_holder_name) { size_t feed_count = 0; for (auto* op : block.AllOps()) { if (op->Type() == kFeedOpType) { feed_count++; + // The input variable's name of feed_op should be feed_holder_name. PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, "Input to feed op should be '%s'", feed_holder_name); std::string feed_target_name = op->Output("Out")[0]; @@ -166,13 +167,15 @@ static bool has_feed_operators( feed_count, feed_targets.size(), "The number of feed operators should match 'feed_targets'"); - // When feed operator are present, so should be feed_holder - auto var = block.FindVar(feed_holder_name); - PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", - feed_holder_name); - PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH, - "'%s' variable should be 'FEED_MINIBATCH' type", - feed_holder_name); + if (!feed_holder_name.empty()) { + // When feed operator are present, so should be feed_holder. + auto var = block.FindVar(feed_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + feed_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH, + "'%s' variable should be 'FEED_MINIBATCH' type", + feed_holder_name); + } } return feed_count > 0; @@ -185,12 +188,14 @@ static bool has_feed_operators( // and fetch_holder_name. Raise exception when any mismatch is found. // Return true if the block has fetch operators and holder of matching info. static bool has_fetch_operators( - const BlockDesc& block, std::map& fetch_targets, + const BlockDesc& block, + const std::map& fetch_targets, const std::string& fetch_holder_name) { size_t fetch_count = 0; for (auto* op : block.AllOps()) { if (op->Type() == kFetchOpType) { fetch_count++; + // The output variable's name of fetch_op should be fetch_holder_name. PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, "Output of fetch op should be '%s'", fetch_holder_name); std::string fetch_target_name = op->Input("X")[0]; @@ -206,13 +211,15 @@ static bool has_fetch_operators( fetch_count, fetch_targets.size(), "The number of fetch operators should match 'fetch_targets'"); - // When fetch operator are present, so should be fetch_holder - auto var = block.FindVar(fetch_holder_name); - PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", - fetch_holder_name); - PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST, - "'%s' variable should be 'FETCH_LIST' type", - fetch_holder_name); + if (!fetch_holder_name.empty()) { + // When fetch operator are present, so should be fetch_holder. + auto var = block.FindVar(fetch_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + fetch_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST, + "'%s' variable should be 'FETCH_LIST' type", + fetch_holder_name); + } } return fetch_count > 0; @@ -259,16 +266,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } } - // map the data of feed_targets to feed_holder - for (auto* op : global_block->AllOps()) { - if (op->Type() == kFeedOpType) { - std::string feed_target_name = op->Output("Out")[0]; - int idx = boost::get(op->GetAttr("col")); - SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, - idx); - } - } - if (!has_fetch_ops) { // create fetch_holder variable auto* fetch_holder = global_block->Var(fetch_holder_name); @@ -292,17 +289,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } } - Run(*copy_program, scope, 0, create_vars, create_vars); - - // obtain the data of fetch_targets from fetch_holder - for (auto* op : global_block->AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - int idx = boost::get(op->GetAttr("col")); - *fetch_targets[fetch_target_name] = - GetFetchVariable(*scope, fetch_holder_name, idx); - } - } + auto ctx = Prepare(*copy_program, 0); + RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars, + feed_holder_name, fetch_holder_name); } std::unique_ptr Executor::Prepare( @@ -370,5 +359,42 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } +void Executor::RunPreparedContext( + ExecutorPrepareContext* ctx, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, bool create_vars, + const std::string& feed_holder_name, const std::string& fetch_holder_name) { + auto& global_block = ctx->prog_.Block(ctx->block_id_); + + PADDLE_ENFORCE( + has_feed_operators(global_block, feed_targets, feed_holder_name), + "Program in ExecutorPrepareContext should has feed_ops."); + PADDLE_ENFORCE( + has_fetch_operators(global_block, fetch_targets, fetch_holder_name), + "Program in the prepared context should has fetch_ops."); + + // map the data of feed_targets to feed_holder + for (auto* op : global_block.AllOps()) { + if (op->Type() == kFeedOpType) { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, + idx); + } + } + + RunPreparedContext(ctx, scope, create_vars, create_vars); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : global_block.AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + GetFetchVariable(*scope, fetch_holder_name, idx); + } + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 75b29b2f4065ad75b62a134b890b8f9f6730fdc7..43defdacf2a1c2f59cf3af2461ae6cfc4c61f5be 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -70,6 +73,13 @@ class Executor { bool create_local_scope = true, bool create_vars = true); + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + bool create_vars = true, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + private: const platform::Place place_; }; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a3b4a8c0829ae3324e933309b2eaea35fe571997..f97bd0827428feeb590fcad16c48f3461517a646 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) { } } -static DDim GetDims(const Scope& scope, const std::string& name) { +static DDim GetDims(const Scope& scope, const std::string& name, + bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { return DDim({-1}); @@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) { if (var->IsType()) { return var->Get().dims(); } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + if (get_actual_dim) { + return var->Get().value().dims(); + } else { + return var->Get().GetCompleteDims(); + } } else { return DDim({-1}); } @@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; if (scope) { - ss << "[" << GetDims(*scope, input.second[i]) << "]"; + ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } if (i != input.second.size() - 1) { @@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; if (scope) { - ss << "[" << GetDims(*scope, output.second[i]) << "]"; + ss << "[" << GetDims(*scope, output.second[i], true) << "]"; ss << "(" << GetLoD(*scope, output.second[i]) << ")"; } if (i != output.second.size() - 1) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 20dcc080b696431b9972c0a972904d957f9b47d8..c1486b527d2e06d2b3f7e0f89458bf9a22564586 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include +#include #include #ifdef PADDLE_WITH_CUDA @@ -41,6 +42,8 @@ class ParallelExecutorPrivate { #ifdef PADDLE_WITH_CUDA std::unique_ptr nccl_ctxs_; #endif + + std::vector> var_types_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor( allow_op_delay)); // Step 3. Create vars in each scope; - for (auto *scope : member_->local_scopes_) { - for (auto *var : main_program.Block(0).AllVars()) { - if (scope->FindVar(var->Name()) != nullptr) { - continue; - } - - InitializeVariable(scope->Var(var->Name()), var->GetType()); - } + for (auto *var : main_program.Block(0).AllVars()) { + member_->var_types_.emplace_back(var->Name(), var->GetType(), + var->Persistable()); } } @@ -163,9 +161,42 @@ void ParallelExecutor::Run( const std::unordered_map &feed_tensors) { platform::RecordBlock b(0); SplitTensorToPlaces(feed_tensors); + + // Create local scopes. + for (auto &scope : member_->local_scopes_) { + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &name_type_pair : member_->var_types_) { + if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) { + continue; + } + + if (std::get<2>(name_type_pair)) { // Persistable + InitializeVariable(scope->Var(std::get<0>(name_type_pair)), + std::get<1>(name_type_pair)); + } else { + InitializeVariable(scope->Var(std::get<0>(name_type_pair)), + std::get<1>(name_type_pair)); + } + } + } + auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; + + // Wait All computational streams + for (auto p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + for (auto &scope : member_->local_scopes_) { + auto &local_scope = + *scope->Var(details::kLocalExecScopeName)->GetMutable(); + scope->DeleteScope(local_scope); + local_scope = nullptr; + } } void ParallelExecutor::SplitTensorToPlaces( diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 9854d618d2b29ed123833f55198179638c95d6db..f26f212d4d5793b88fd1e6d782cdf983bf341879 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -14,8 +14,12 @@ #include "paddle/fluid/framework/threadpool.h" +#include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" +DEFINE_int32(io_threadpool_size, 100, + "number of threads used for doing IO, default 100"); + namespace paddle { namespace framework { @@ -91,5 +95,20 @@ void ThreadPool::TaskLoop() { } } +std::unique_ptr ThreadPoolIO::io_threadpool_(nullptr); +std::once_flag ThreadPoolIO::io_init_flag_; + +ThreadPool* ThreadPoolIO::GetInstanceIO() { + std::call_once(io_init_flag_, &ThreadPoolIO::InitIO); + return io_threadpool_.get(); +} + +void ThreadPoolIO::InitIO() { + if (io_threadpool_.get() == nullptr) { + // TODO(typhoonzero1986): make this configurable + io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size)); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index f9dce7105e32ff0ba03d03f8faaac3a4ed1a3595..94111ee335b1a5df327b3e46d62069b4735c54f6 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,12 +14,12 @@ limitations under the License. */ #pragma once -#include +#include // NOLINT #include -#include -#include +#include // NOLINT +#include // NOLINT #include -#include +#include // NOLINT #include #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -28,6 +28,22 @@ limitations under the License. */ namespace paddle { namespace framework { +struct ExceptionHandler { + mutable std::future> future_; + explicit ExceptionHandler( + std::future>&& f) + : future_(std::move(f)) {} + void operator()() const { + auto ex = this->future_.get(); + if (ex != nullptr) { + LOG(FATAL) << "The exception is thrown inside the thread pool. You " + "should use RunAndGetException to handle the exception.\n" + "The default exception handler is LOG(FATAL)." + << ex->what(); + } + } +}; + // ThreadPool maintains a queue of tasks, and runs them using a fixed // number of threads. class ThreadPool { @@ -87,22 +103,6 @@ class ThreadPool { void Wait(); private: - struct ExceptionHandler { - mutable std::future> future_; - explicit ExceptionHandler( - std::future>&& f) - : future_(std::move(f)) {} - void operator()() const { - auto ex = this->future_.get(); - if (ex != nullptr) { - LOG(FATAL) << "The exception is thrown inside the thread pool. You " - "should use RunAndGetException to handle the exception.\n" - "The default exception handler is LOG(FATAL)." - << ex->what(); - } - } - }; - DISABLE_COPY_AND_ASSIGN(ThreadPool); // If the task queue is empty and avaialbe is equal to the number of @@ -135,6 +135,17 @@ class ThreadPool { std::condition_variable completed_; }; +class ThreadPoolIO : ThreadPool { + public: + static ThreadPool* GetInstanceIO(); + static void InitIO(); + + private: + // NOTE: threadpool in base will be inhereted here. + static std::unique_ptr io_threadpool_; + static std::once_flag io_init_flag_; +}; + // Run a function asynchronously. // NOTE: The function must return void. If the function need to return a value, // you can use lambda to capture a value pointer. @@ -143,5 +154,10 @@ std::future Async(Callback callback) { return ThreadPool::GetInstance()->Run(callback); } +template +std::future AsyncIO(Callback callback) { + return ThreadPoolIO::GetInstanceIO()->Run(callback); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index f417f62f3f75360f4ae1b7795608ae95200cfeb8..e53bcf2384e54e21c7dd5638f3b7469a35b571bf 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,4 +1,4 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init) +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) cc_library(paddle_fluid_api SRCS io.cc @@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules}) # Create shared library cc_library(paddle_fluid_shared SHARED SRCS io.cc - DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END) + DEPS ${fluid_modules}) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index a5b62ef322bfad0fc956d7d722797bd5add6aea6..3b58019db6e55fa8198d2f77731095c6cf356266 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -17,10 +17,16 @@ limitations under the License. */ #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/pybind/pybind.h" namespace paddle { namespace inference { +// Temporarily add this function for exposing framework::InitDevices() when +// linking the inference shared library. +void Init(bool init_p2p) { framework::InitDevices(init_p2p); } + void ReadBinaryFile(const std::string& filename, std::string& contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index d07d315b93ef10a464080899b1cb9920abe83be3..756c936b33ad55e2994542b171b945e248ba2e21 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -18,12 +18,15 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" namespace paddle { namespace inference { +void Init(bool init_p2p); + void LoadPersistables(framework::Executor& executor, framework::Scope& scope, const framework::ProgramDesc& main_program, const std::string& dirname, diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 86e36f3f65cb79b23e5bb23d7d8b2b34a000193c..97d9f03f88ad3e851a2dd4256d34e8ca76fdfb01 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -17,7 +17,7 @@ function(inference_test TARGET_NAME) string(REGEX REPLACE "^_$" "" arg "${arg}") cc_test(test_inference_${TARGET_NAME}${arg} SRCS test_inference_${TARGET_NAME}.cc - DEPS ARCHIVE_START paddle_fluid ARCHIVE_END + DEPS paddle_fluid ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) set_tests_properties(test_inference_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index ca2077d07411d2cd6095e0dc2a874af0890145c5..1e6555bb02033a28dedd2a1d1962981dfcc97cc2 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -46,8 +46,8 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; - TestInference(dirname, cpu_feeds, - cpu_fetchs1, FLAGS_repeat); + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -57,8 +57,8 @@ TEST(inference, image_classification) { // Run inference on CUDA GPU LOG(INFO) << "--- GPU Runs: ---"; - TestInference(dirname, cpu_feeds, - cpu_fetchs2, FLAGS_repeat); + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 064e400f0c750872ab2142c5fc8e28dd3da85b1a..c3a8d0889c6a6dd9591837ccc523da56f8d13661 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -89,7 +89,7 @@ void CheckError(const paddle::framework::LoDTensor& output1, EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, const std::vector& cpu_fetchs, @@ -175,8 +175,15 @@ void TestInference(const std::string& dirname, } // Ignore the profiling results of the first run - executor.Run(*inference_program, scope, feed_targets, fetch_targets, - CreateVars); + std::unique_ptr ctx; + if (PrepareContext) { + ctx = executor.Prepare(*inference_program, 0); + executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, + CreateVars); + } else { + executor.Run(*inference_program, scope, feed_targets, fetch_targets, + CreateVars); + } // Enable the profiler paddle::platform::EnableProfiler(state); @@ -187,8 +194,15 @@ void TestInference(const std::string& dirname, "run_inference", paddle::platform::DeviceContextPool::Instance().Get(place)); - executor.Run(*inference_program, scope, feed_targets, fetch_targets, - CreateVars); + if (PrepareContext) { + // Note: if you change the inference_program, you need to call + // executor.Prepare() again to get a new ExecutorPrepareContext. + executor.RunPreparedContext(ctx.get(), scope, feed_targets, + fetch_targets, CreateVars); + } else { + executor.Run(*inference_program, scope, feed_targets, fetch_targets, + CreateVars); + } } // Disable the profiler and print the timing information diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 3c8696b508443e1b8d9f7cac6336b70562ffedc5..7d6781c2c38822eaabb64eda9c76ff657bbdeeb8 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -245,9 +245,17 @@ op_library(channel_send_op DEPS concurrency) op_library(channel_recv_op DEPS concurrency) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) + +# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF +# Because the fully connected layer has only one MKLDNN's operator +if(NOT WITH_MKLDNN) + list(REMOVE_ITEM GENERAL_OPS fc_op) +endif(NOT WITH_MKLDNN) + foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() + file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") add_subdirectory(reader) diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index eecb58e11ef57b550c79c040e6933ed6e52e2e87..cb1927bc0f2eb735f0a3184df5f0f8fada2f9dca 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -114,23 +114,11 @@ class BatchNormKernel const auto *bias = ctx.Input("Bias"); auto *y = ctx.Output("Y"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); // alloc memory y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant> - functor; - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); auto handle = dev_ctx.cudnn_handle(); @@ -159,6 +147,21 @@ class BatchNormKernel // Run training mode. // obtain running mean and running inv var, and see if we need to // initialize them. + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + math::SetConstant> + functor; + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + double this_factor = 1. - momentum; CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index d65a7b34678cda38d5f8beb9154d61928f517ce0..4a36b03cb63ac3ea61be1bbc56b8dd0adbe7d334 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" + #include #include @@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel { size_t axis = static_cast(ctx->Attrs().Get("axis")); const size_t n = ins.size(); - PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1."); + PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0."); + if (n == 1) { + VLOG(3) << "Warning: concat op have only one input, may waste memory"; + } auto out_dims = ins[0]; size_t in_zero_dims_size = out_dims.size(); diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 45f88ec8697d9f3de2612f28889fefc36f7ddbf9..661dfa69fe1580ff3890f12defcd124225be0c06 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -35,7 +35,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, + this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; @@ -89,7 +90,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, + this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -132,8 +134,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -196,7 +198,7 @@ bool RPCClient::Wait() { std::vector> waits(req_count_); for (int i = 0; i < req_count_; i++) { - waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); }); } for (int i = 0; i < req_count_; i++) { diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index d5fc163bc25409e0607b149b61c6266b38119d9d..119e146e078e476b2768a8495ea63e468f952fd2 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase { ::grpc::ByteBuffer reply; std::string var_name = request_->OutVarname(); + VLOG(3) << "prefetch var " << var_name; auto var_desc = program_->Block(0).FindVar(var_name); framework::Scope* local_scope = &scope_->NewScope(); auto* var = local_scope->FindVar(var_name); @@ -216,10 +217,10 @@ void AsyncGRPCServer::RunSyncUpdate() { std::function prefetch_register = std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); + // TODO(wuyi): Run these "HandleRequest" in thread pool t_send_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_send_.get(), "cq_send", send_register))); - t_get_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_get_.get(), "cq_get", get_register))); diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 9188f2d989e601b7a97dedaf71f7080829cdb7c3..5d293665f0bcc098126ad3ec6c9bf34ff54c3b6f 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include +#include // NOLINT +#include #include "paddle/fluid/operators/listen_and_serv_op.h" @@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, auto ins = Inputs("X"); auto fan_in = Attr("Fanin"); - auto *block = Attr(kOptimizeBlock); - auto *program = block->Program(); + auto *optimize_block = Attr(kOptimizeBlock); + auto *prefetch_block = Attr(kPrefetchBlock); + auto *program = optimize_block->Program(); size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); @@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, framework::Executor executor(dev_place); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { - block_list.push_back(blkid); + if (blkid != prefetch_block->ID()) { + block_list.push_back(blkid); + } } - auto prepared = executor.Prepare(*program, block_list); + auto optimize_prepared = executor.Prepare(*program, block_list); // Insert placeholder for block0 which holds current op itself. - prepared.insert(prepared.begin(), - std::shared_ptr(nullptr)); + optimize_prepared.insert( + optimize_prepared.begin(), + std::shared_ptr(nullptr)); rpc_service_->SetScope(&recv_scope); rpc_service_->SetDevCtx(&dev_ctx); // TODO(qiao) set proper fields for table lookup and update rpc_service_->SetExecutor(&executor); - rpc_service_->SetPrefetchBlkdId(0); + VLOG(3) << "prefetch block id is " << prefetch_block->ID(); + auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); + rpc_service_->SetPrefetchBlkdId(prefetch_block->ID()); + rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get()); + prefetch_prepared.release(); rpc_service_->SetProgram(program); // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); @@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, parallel_blkids.push_back(1); double ts = detail::GetTimestamp(); for (size_t blkid = 2; blkid < num_blocks; ++blkid) { - if (program->Block(blkid).Parent() != last_parent_blkid) { - ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program, - &recv_scope); - parallel_blkids.clear(); - last_parent_blkid = program->Block(blkid).Parent(); + if (blkid != prefetch_block->ID()) { + if (program->Block(blkid).Parent() != last_parent_blkid) { + ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, + program, &recv_scope); + parallel_blkids.clear(); + last_parent_blkid = program->Block(blkid).Parent(); + } + parallel_blkids.push_back(blkid); } - parallel_blkids.push_back(blkid); } - ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program, - &recv_scope); + ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, + program, &recv_scope); VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; // Reset the received sparse variables, the sum operator would not @@ -211,6 +222,8 @@ from send_op and send back variables to recv_op. .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); AddAttr(kOptimizeBlock, "BlockID to run on server side."); + AddAttr(kPrefetchBlock, + "prefetch block to run on server side."); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 0da87afc961e896f04b4f0028bf9b17d5e992548..759b2a462ba5b938991aa86be9b9dc3e59fe3f7e 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -27,6 +28,7 @@ namespace paddle { namespace operators { constexpr char kOptimizeBlock[] = "OptimizeBlock"; +constexpr char kPrefetchBlock[] = "PrefetchBlock"; void RunServer(std::shared_ptr service); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index bf33be310686640fa187a07cf46a157b7f433340..5e59bd1b178ad1803f6f70c5f3f9fd7af495ac3c 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); + AddAttr("is_distributed", + "(boolean, default false) distributed lookup table.") + .SetDefault(false); AddAttr("padding_idx", "(int64, default -1) " "If the value is -1, it makes no effect to lookup. " diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index e53183603fec54ceef68873cfd97b4b985b0d437..c28047e6e915280eed6886f99cd6d55704e3f4ad 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -288,9 +288,14 @@ void batched_gemm( // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, "cublas Hgemm requires GPU compute capability >= 53"); + +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -310,9 +315,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -332,9 +341,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h index a36abe3789574cb64f05001e34d534cf352a60b2..c93c096575a30dd9344894ead4b81acc16930e21 100644 --- a/paddle/fluid/operators/pad_op.h +++ b/paddle/fluid/operators/pad_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index c88578570c1acdecaa97dd8b12a702778fef2b7e..63eaaedcd5fc3df17902511dc02b25bf43ccd241 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + mkldnn::memory({src_md, mkldnn_engine}, + static_cast(const_cast(input_data))); auto dst_memory = - mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); + mkldnn::memory({dst_md, mkldnn_engine}, + static_cast(const_cast(output_data))); auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, *workspace_memory); @@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { pool_bwd_desc, mkldnn_engine, *pool_pd); auto diff_src_memory = - mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); + mkldnn::memory({diff_src_md, mkldnn_engine}, + static_cast(const_cast(in_x_grad_data))); auto diff_dst_memory = - mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); + mkldnn::memory({diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data))); auto bwd_prim = mkldnn::pooling_backward( pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 2fec50ef25e0d2621a87963acdf142d24970329d..a48127ea6983d3d4ea12ec4925f30af233002ef2 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index 83e7bd138ae25c6d3e09c3d01178d6887205bf98..b55fa76eae34c3179d40f31ed6a57d3ecbbaaccf 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 09ab7da663b5ef5f099b9f65b0df661ceea0d9e2..f9ae01ab5d2972d2a74b36ae6035985d1d874bb6 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get " - << outs[i] << "back"; + VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " + << outs[i] << " back"; rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i], outs[i]); } else { @@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker { "(RPCClient) The RPC client object which will be" "initialized at most once."); AddOutput("Out", - "(SelectedRows) result " + "(LoDTensor) result " "to be fetched from parameter server") .AsDuplicable(); AddAttr>( diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 7fb45bd19da3a7f0c51d8e98a52efe62c15c1c55..8eaa12a4a6cfc09fd4e2c3642bc8825fe2af6d6b 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/prelu_op.h" - #include namespace paddle { diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc index 82e54139c8c1f42b1d8f74811a6793ec5c66473e..058b13eeb872aaa77a88da37db64a6d59fbdd1cf 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/prior_box_op.cc @@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { bool flip = ctx->Attrs().Get("flip"); std::vector aspect_ratios_vec; - ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); if (max_sizes.size() > 0) { diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/prior_box_op.cu index 76bf2b3b7de7a24c80e927c16199f89c5b7fb794..0ea8909296f8f52d252b0ec258666cf32d69a8bb 100644 --- a/paddle/fluid/operators/prior_box_op.cu +++ b/paddle/fluid/operators/prior_box_op.cu @@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h index 1e4a12aac1c5f1c3b7e2e1bc83170de9ad590fc3..1c62fd8d2c4d4e4deba4ca6442efbaff83e36c35 100644 --- a/paddle/fluid/operators/prior_box_op.h +++ b/paddle/fluid/operators/prior_box_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/transform.h" @@ -22,23 +24,23 @@ namespace operators { inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, bool flip, - std::vector& output_aspect_ratior) { + std::vector* output_aspect_ratior) { constexpr float epsilon = 1e-6; - output_aspect_ratior.clear(); - output_aspect_ratior.push_back(1.0f); + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { float ar = input_aspect_ratior[i]; bool already_exist = false; - for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { - if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { already_exist = true; break; } } if (!already_exist) { - output_aspect_ratior.push_back(ar); + output_aspect_ratior->push_back(ar); if (flip) { - output_aspect_ratior.push_back(1.0f / ar); + output_aspect_ratior->push_back(1.0f / ar); } } } @@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 767eef56861ea075ec2450b1456e7c5c807ce25d..a1127f11a75e54168ca9682a0189255d37ee8571 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_loss_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 083c1fae5e2016ada6309aba78bdfa6ad7fef89c..a4dcf704a63ae3bad6567ddb042ea23513bccff7 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -19,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include #include "paddle/fluid/operators/detail/grpc_client.h" namespace paddle { diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 807e5ad951b893a4c027a96d743f0606b70cf160..9abc78421a7554f51f56665e4d82d34e67c7c159 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -60,7 +60,7 @@ class ReshapeOp : public framework::OperatorWithKernel { static framework::DDim ValidateShape(const std::vector shape, const framework::DDim &in_dims) { const int64_t in_size = framework::product(in_dims); - // only one dimension canbe set to -1, whose size will be automatically + // only one dimension can be set to -1, whose size will be automatically // infered. const int64_t unk_dim_val = -1; const int64_t copy_dim_val = 0; @@ -119,13 +119,15 @@ class ReshapeKernel : public framework::OpKernel { auto *shape_tensor = ctx.Input("Shape"); framework::DDim out_dims = out->dims(); + if (shape_tensor) { auto *shape_data = shape_tensor->data(); + framework::Tensor cpu_shape_tensor; if (platform::is_gpu_place(ctx.GetPlace())) { - framework::Tensor cpu_shape_tensor; TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); + ctx.device_context().Wait(); } auto shape = std::vector(shape_data, shape_data + shape_tensor->numel()); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index f38c5a3c0c9952b37f7db468ea00470a00b5ff6f..54e07490319cf1da749bd33449a7b51efd6c3d65 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index 542bc3fde2a3616807eea560be85fb42026d5825..3bf5d57809019d3ae469471c2ee2e7aac70b9faf 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include #include -#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" @@ -37,11 +37,11 @@ namespace m = paddle::operators::math; std::unique_ptr listen_and_serv_op; int selected_port; -void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { +void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) { p::CPUDeviceContext ctx(place); for (int i = 0; i < 2; ++i) { auto var_name = paddle::string::Sprintf("x%d", i); - auto var = scope.Var(var_name); + auto var = scope->Var(var_name); auto tensor = var->GetMutable(); tensor->Resize({10, 10}); float *expect = tensor->mutable_data(place); @@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { } } - auto out_var = scope.Var("Out"); + auto out_var = scope->Var("Out"); auto out_tensor = out_var->GetMutable(); out_tensor->Resize({10, 10}); out_tensor->mutable_data(place); // allocate } -void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { +void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) { p::CPUDeviceContext ctx(place); int64_t height = 10; int64_t row_numel = 10; m::SetConstant set_one; // init x0 std::vector rows0{0, 4, 7}; - auto x0_var = scope.Var("x0"); + auto x0_var = scope->Var("x0"); auto x0 = x0_var->GetMutable(); x0->set_rows(rows0); x0->set_height(height); @@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { // init x1 std::vector rows1{2, 9}; - auto x1_var = scope.Var("x1"); + auto x1_var = scope->Var("x1"); auto x1 = x1_var->GetMutable(); x1->set_rows(rows1); x1->set_height(height); @@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { f::make_ddim({static_cast(rows1.size()), row_numel}), place); set_one(ctx, x1_value, 1.0); - auto out_var = scope.Var("Out"); + auto out_var = scope->Var("Out"); auto out = out_var->GetMutable(); auto out_value = out->mutable_value(); out->set_height(height); @@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) { f::Scope scope; p::CPUPlace place; if (is_sparse) { - InitSelectedRowsInScope(scope, place); + InitSelectedRowsInScope(place, &scope); } else { - InitTensorsInScope(scope, place); + InitTensorsInScope(place, &scope); } // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); auto *optimize_block = program.AppendBlock(root_block); + auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensers, must be of same shape. AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); @@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) { attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); attrs.insert({"OptimizeBlock", optimize_block}); + attrs.insert({"PrefetchBlock", prefetch_block}); listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); LOG(INFO) << "selected port before run " << selected_port; @@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) { // local net f::Scope scope; p::CPUPlace place; - InitTensorsInScope(scope, place); + InitTensorsInScope(place, &scope); // create rpc client var scope.Var("RPC_CLIENT_VAR"); @@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) { f::Scope scope; p::CPUPlace place; p::CPUDeviceContext ctx(place); - InitSelectedRowsInScope(scope, place); + InitSelectedRowsInScope(place, &scope); scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; selected_port = static_cast( diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc index 2cbd9e2394800dc3b9c5be1163d16bbec435c533..56b3713d6af28d0787e114a672a503e86cbd85fd 100644 --- a/paddle/fluid/operators/send_vars_op.cc +++ b/paddle/fluid/operators/send_vars_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase { auto ins = Inputs("X"); std::vector epmap = Attr>("epmap"); - int sync_send = Attr("sync_sent"); + int sync_send = Attr("sync_send"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 074fa9e00f2ec531f324ff10113d95144687d500..06cb0550ad7d4ad0241a4f439ea9ac16d9714c38 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "Learning rate should have 1 element"); auto param_dim = ctx->GetInputDim("Param"); - // TODO(qijun): check dimensions of Param and Grad at complie - // and run time. + // TODO(qijun): check dimensions of Param and Grad at compile + // and runtime. ctx->SetOutputDim("ParamOut", param_dim); } diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index 8d2bdf75903b4958e14605781f65c5a214cb5300..cfc8793e1e05a7d4fa9207ae77a664b391b9a986 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -65,7 +65,8 @@ class SGDOpKernel : public framework::OpKernel { auto &grad_rows = grad->rows(); size_t grad_row_numel = grad_value.numel() / grad_rows.size(); - PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height); + PADDLE_ENFORCE_EQ(static_cast(grad_row_numel), + param_out->numel() / grad_height); auto *grad_data = grad_value.data(); auto *out_data = param_out->data(); @@ -73,7 +74,7 @@ class SGDOpKernel : public framework::OpKernel { for (size_t i = 0; i < grad_rows.size(); i++) { PADDLE_ENFORCE(grad_rows[i] < grad_height, "Input rows index should less than height"); - for (int64_t j = 0; j < grad_row_numel; j++) { + for (size_t j = 0; j < grad_row_numel; j++) { out_data[grad_rows[i] * grad_row_numel + j] -= lr[0] * grad_data[i * grad_row_numel + j]; } @@ -107,7 +108,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE(grad.rows()[i] < grad.height(), "Input rows index should less than height"); int64_t id_index = param.index(grad.rows()[i]); - for (int64_t j = 0; j < grad_row_width; j++) { + for (size_t j = 0; j < grad_row_width; j++) { out_data[id_index * grad_row_width + j] -= lr[0] * grad_data[i * grad_row_width + j]; } diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc index a54f8a2878c8606e6b487552324d1e7dfa94b9b8..a53cbc8ac5199061dafdc7f4cf560b9e4fc577ab 100644 --- a/paddle/fluid/operators/split_ids_op.cc +++ b/paddle/fluid/operators/split_ids_op.cc @@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out."); auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR); - auto ids_dims = ctx->GetInputDim("Ids"); - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); + if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } } }; @@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { + auto *input_var = block->Var(op_desc.Input("Ids")[0]); for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR); + block->Var(out_var)->SetType(input_var->GetType()); } } }; @@ -73,4 +74,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker, ops::SplitIdsOpInferVarType); REGISTER_OP_CPU_KERNEL( - split_ids, ops::SplitIdsOpKernel); + split_ids, ops::SplitIdsOpKernel, + ops::SplitIdsOpKernel); diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d36ed398ebce661a62ca92696b0089b5289d5b1c..ba1e903dbb6daaa86b1b664322d100a800fd16b3 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -24,35 +24,63 @@ namespace operators { template class SplitIdsOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); if (!platform::is_cpu_place(place)) { PADDLE_THROW("SplitIds do not support GPU kernel"); } - auto& ids_dims = ctx.Input("Ids")->dims(); - const T* ids = ctx.Input("Ids")->data(); - auto outs = ctx.MultiOutput("Out"); - const size_t shard_num = outs.size(); + const auto *ids_var = ctx.InputVar("Ids"); + if (ids_var->IsType()) { + const auto &ids_dims = ctx.Input("Ids")->dims(); + const T *ids = ctx.Input("Ids")->data(); + auto outs = ctx.MultiOutput("Out"); + const size_t shard_num = outs.size(); - std::vector> out_ids; - out_ids.resize(outs.size()); + std::vector> out_ids; + out_ids.resize(outs.size()); - // split id by their shard_num. - for (int i = 0; i < ids_dims[0]; ++i) { - T id = ids[i]; - size_t shard_id = static_cast(id) % shard_num; - out_ids[shard_id].push_back(id); - } + // split id by their shard_num. + for (int i = 0; i < ids_dims[0]; ++i) { + T id = ids[i]; + size_t shard_id = static_cast(id) % shard_num; + out_ids[shard_id].push_back(id); + } + + // create tensor for each shard and send to parameter server + for (size_t i = 0; i < out_ids.size(); ++i) { + auto *shard_t = outs[i]; + std::vector ids = out_ids[i]; + auto *shard_data = shard_t->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + for (size_t i = 0; i < ids.size(); ++i) { + shard_data[i] = ids[i]; + } + } + } else if (ids_var->IsType()) { + const auto *ids_selected_rows = ctx.Input("Ids"); + auto &ids_dims = ids_selected_rows->value().dims(); + PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), ""); + const T *ids = ids_selected_rows->value().data(); + const auto &ids_rows = ids_selected_rows->rows(); + auto outs = ctx.MultiOutput("Out"); + const size_t shard_num = outs.size(); + // get rows for outputs + for (auto &id : ids_rows) { + size_t shard_id = static_cast(id) % shard_num; + outs[shard_id]->mutable_rows()->push_back(id); + } - // create tensor for each shard and send to parameter server - for (size_t i = 0; i < out_ids.size(); ++i) { - auto* shard_t = outs[i]; - std::vector ids = out_ids[i]; - auto* shard_data = shard_t->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - for (size_t i = 0; i < ids.size(); ++i) { - shard_data[i] = ids[i]; + int64_t row_width = ids_dims[1]; + for (auto &out : outs) { + out->set_height(ids_selected_rows->height()); + framework::DDim ddim = framework::make_ddim( + {static_cast(out->rows().size()), row_width}); + T *output = out->mutable_value()->mutable_data(ddim, place); + for (size_t i = 0; i < ddim[0]; ++i) { + memcpy(output + i * row_width, ids + out->rows()[i] * row_width, + row_width * sizeof(T)); + } } } } diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 22c1db82e9f5aff6aa9a311cd1093b33fa7e6db9..7a10218e1556698f3e0a1828db5de8851dd1c90b 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_dim, const framework::DDim& dst_stride, T* dst) { - using namespace detail; - StridedCopyDimVisitor func(dev_ctx, src, src_stride, dst_stride, dst); + paddle::operators::detail::StridedCopyDimVisitor func( + dev_ctx, src, src_stride, dst_stride, dst); boost::apply_visitor(func, dst_dim); } diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 9061e137bd1c789d34665729c48c1c2ea9525c8e..108f26fafe7af76eaa613d77ed77748ee43ea234 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sum_op.h" + #include #include #include + #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/detail/safe_ref.h" @@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputsDim("X"); size_t N = x_dims.size(); - PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); + PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0."); + if (N == 1) { + VLOG(3) << "Warning: sum have only one input, may waste memory"; + } framework::DDim in_dim({0}); for (auto& x_dim : x_dims) { diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index bfd26c2f2294f954adc81a1719650c46372098c4..d7f4d383ce0d9e1ff42fc12c96aaf0ceb532e5db 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" namespace paddle { @@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* src, - bool& firstStep, bool& is_empty, - Pair& max, int dim, + bool* firstStep, bool* is_empty, + Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, src, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - (*beam)) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, src, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* val, - int* col, bool& firstStep, - bool& is_empty, Pair& max, + int* col, bool* firstStep, + bool* is_empty, Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, val, col, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - *beam) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, Pair topk[], T** topVal, - int64_t** topIds, int& beam, int& k, + int64_t** topIds, int* beam, int* k, const int tid, const int warp) { while (true) { __syncthreads(); @@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, (*topVal)++; (*topIds)++; } - if (tid == maxid[0]) beam++; - if (--k == 0) break; + if (tid == maxid[0]) (*beam)++; + if (--(*k) == 0) break; __syncthreads(); if (tid == maxid[0]) { - if (beam < MaxLength) { - sh_topk[tid] = topk[beam]; + if (*beam < MaxLength) { + sh_topk[tid] = topk[*beam]; } } if (maxid[0] / 32 == warp) { - if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break; } } } @@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, topk[k].set(-INFINITY, -1); } while (k) { - ThreadGetTopK(topk, beam, k, - src + blockIdx.x * lds, firststep, - is_empty, max, dim, tid); + ThreadGetTopK(topk, &beam, k, + src + blockIdx.x * lds, &firststep, + &is_empty, &max, dim, tid); sh_topk[tid] = topk[0]; BlockReduce(sh_topk, maxid, topk, &output, - &indices, beam, k, tid, warp); + &indices, &beam, &k, tid, warp); } } @@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel { KeMatrixTopK<<< grid, threads, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(output_data, output->dims()[1], - indices_data, input_data, - input_width, input_width, int(k)); + .stream()>>>( + output_data, output->dims()[1], indices_data, input_data, input_width, + input_width, static_cast(k)); } }; diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index e18ace844e203be5b1b37040432bdad565a0734c..b0522b49f44d8ed0c8c7e3148e24f312fbdd1123 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -13,14 +13,17 @@ # limitations under the License. from __future__ import print_function -import framework -from framework import Program, default_main_program, default_startup_program, Parameter, Variable -import optimizer -from layer_helper import LayerHelper -import distributed_splitter as splitter + import math + +import distributed_splitter as splitter +import framework +from framework import Program, default_main_program, Variable from . import core -import debuger + +LOOKUP_TABLE_TYPE = "lookup_table" +LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" +RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR" class VarBlock: @@ -35,9 +38,9 @@ class VarBlock: class UnionFind(object): - """ Union-find data struct. + """ Union-find data structure. - Union-find is a data struct that keeps track of a set of elements partitioned + Union-find is a data structure that keeps track of a set of elements partitioned into a number of disjoint (non-overlapping) subsets. Reference: @@ -185,19 +188,66 @@ class DistributeTranspiler: assert (callable(split_method)) if program is None: program = default_main_program() - self.program = program - self.trainers = trainers + self.origin_program = program + self.trainer_num = trainers self.optimize_ops = optimize_ops # TODO(typhoonzero): currently trainer_id is fetched from cluster system # like Kubernetes, we should port this to use etcd later when developing # fluid distributed training with fault-tolerance. self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") + self.pserver_endpoints = pserver_endpoints + + # process lookup_table_op + # 1. check all lookup_table_op is distributed + # 2. check all lookup_table_op share the same table. + distributed_lookup_table_ops = [] + # support only one distributed_lookup_table now + self.table_name = None + for op in program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if op.attrs['is_distributed'] is True: + if self.table_name is None: + self.table_name = op.input("W")[0] + if self.table_name != op.input("W")[0]: + raise RuntimeError("all distributed lookup_table_ops" + " should have only one table") + distributed_lookup_table_ops.append(op) + else: + if self.table_name is not None: + assert op.input("W")[0] != self.table_name + + self.has_distributed_lookup_table = len( + distributed_lookup_table_ops) > 0 # step1: For large parameters and gradients, split them into smaller # blocks. param_list = [pg[0] for pg in params_grads] grad_list = [pg[1] for pg in params_grads] + + if self.has_distributed_lookup_table: + param_list = [ + param for param in param_list if param.name != self.table_name + ] + grad_list = [ + grad for grad in grad_list + if grad.name != framework.grad_var_name(self.table_name) + ] + self.table_param_grad = [ + param_grad for param_grad in params_grads + if param_grad[0].name == self.table_name + ][0] + table_grad_var = self.table_param_grad[1] + self.table_grad_list = [ + program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, trainer_id, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) # step2: Create new vars for the parameters and gradients blocks and @@ -229,7 +279,7 @@ class DistributeTranspiler: self.param_grad_ep_mapping[ep]["grads"].append(grad) rpc_client_var = program.global_block().create_var( - name="RPC_CLIENT_VAR", + name=RPC_CLIENT_VAR_NAME, persistable=True, type=core.VarDesc.VarType.RAW) @@ -252,13 +302,19 @@ class DistributeTranspiler: outputs={"Out": [orig_param]}, attrs={"axis": 0}) + if self.has_distributed_lookup_table: + self._replace_lookup_table_op_with_prefetch(program, rpc_client_var, + eplist) + self._split_table_grad_and_add_send_vars(program, rpc_client_var, + pserver_endpoints) + def get_trainer_program(self): # remove optimize ops and add a send op to main_program - self.program.global_block().delete_ops(self.optimize_ops) - self.program.sync_with_cpp() + self.origin_program.global_block().delete_ops(self.optimize_ops) + self.origin_program.sync_with_cpp() # FIXME(typhoonzero): serialize once will fix error occurs when clone. - self.program.__str__() - return self.program + self.origin_program.__str__() + return self.origin_program def get_pserver_program(self, endpoint): """ @@ -294,8 +350,8 @@ class DistributeTranspiler: type=v.type, dtype=v.dtype, shape=v.shape) - if self.trainers > 1: - for trainer_id in xrange(self.trainers): + if self.trainer_num > 1: + for trainer_id in xrange(self.trainer_num): var = pserver_program.global_block().create_var( name="%s.trainer_%d" % (orig_var_name, trainer_id), persistable=False, @@ -309,7 +365,7 @@ class DistributeTranspiler: # step3 optimize_block = pserver_program.create_block(0) # step 4 - # Create a union-find data struct from optimize ops, + # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops # into one set. ufind = self._create_ufind(self.optimize_ops) @@ -384,6 +440,23 @@ class DistributeTranspiler: # __append_optimize_op__(glb_op, optimize_block) # break + # process distributed lookup_table + prefetch_block = None + if self.has_distributed_lookup_table: + pserver_index = self.pserver_endpoints.index(endpoint) + self._create_table_optimize_block(pserver_index, pserver_program, + append_block) + prefetch_block = self._create_prefetch_block( + pserver_index, pserver_program, optimize_block) + + # NOTE: if has_distributed_lookup_table is False, then prefetch_block will + # not be executed, so it's safe to use optimize_block to hold the place + if self.has_distributed_lookup_table: + assert prefetch_block is not None + else: + assert prefetch_block is None + prefetch_block = pserver_program.global_block() + # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", @@ -392,8 +465,10 @@ class DistributeTranspiler: attrs={ "OptimizeBlock": optimize_block, "endpoint": endpoint, - "Fanin": self.trainers + "Fanin": self.trainer_num, + "PrefetchBlock": prefetch_block }) + pserver_program.sync_with_cpp() return pserver_program @@ -451,6 +526,197 @@ class DistributeTranspiler: attrs=op.attrs) return s_prog + # transpiler function for dis lookup_table + def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var, + eplist): + # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op + self.prefetch_input_vars = None + self.prefetch_output_vars = None + + continue_search_lookup_table_op = True + while continue_search_lookup_table_op: + continue_search_lookup_table_op = False + all_ops = program.global_block().ops + for op in all_ops: + if op.type == LOOKUP_TABLE_TYPE: + continue_search_lookup_table_op = True + + op_index = list(all_ops).index(op) + ids_name = op.input("Ids") + out_name = op.output("Out") + + if self.prefetch_input_vars is None: + ids_var = program.global_block().vars[ids_name[0]] + self.prefetch_input_vars = self.create_splited_vars( + source_var=ids_var, + block=program.global_block(), + tag="_prefetch_in_") + if self.prefetch_output_vars is None: + out_var = program.global_block().vars[out_name[0]] + self.prefetch_output_vars = self.create_splited_vars( + source_var=out_var, + block=program.global_block(), + tag="_prefetch_out_") + + # insert split_ids_op + program.global_block().insert_op( + index=op_index, + type="split_ids", + inputs={ + 'Ids': [ + program.global_block().vars[varname] + for varname in ids_name + ] + }, + outputs={"Out": self.prefetch_input_vars}) + + # insert prefetch_op + program.global_block().insert_op( + index=op_index + 1, + type="prefetch", + inputs={'X': self.prefetch_input_vars}, + outputs={ + "Out": self.prefetch_output_vars, + "RPCClient": rpc_client_var + }, + attrs={"epmap": eplist}) + + # insert concat_op + program.global_block().insert_op( + index=op_index + 2, + type="concat", + inputs={'X': self.prefetch_output_vars}, + outputs={ + "Out": [ + program.global_block().vars[varname] + for varname in out_name + ] + }, + attrs={"axis": 0}) + + # delete lookup_table_op + program.global_block().delete_ops([op]) + program.sync_with_cpp() + # break for loop + break + + def _split_table_grad_and_add_send_vars(self, program, rpc_client_var, + pserver_endpoints): + # 2. add split_ids_op and send_vars_op to send gradient to pservers + # there should only be one table_name + all_ops = program.global_block().ops + table_grad_name = framework.grad_var_name(self.table_name) + for op in all_ops: + if table_grad_name in op.output_arg_names: + op_index = list(all_ops).index(op) + # insert split_ids_op + program.global_block().insert_op( + index=op_index + 1, + type="split_ids", + inputs={ + 'Ids': [program.global_block().vars[table_grad_name]] + }, + outputs={"Out": self.table_grad_list}) + program.global_block().insert_op( + index=op_index + 2, + type="send_vars", + inputs={'X': self.table_grad_list}, + outputs={"RPCClient": rpc_client_var}, + attrs={"sync_send": True, + "epmap": pserver_endpoints}) + break + + def _create_prefetch_block(self, pserver_index, pserver_program, + optimize_block): + # STEP: create prefetch block + table_var = pserver_program.global_block().vars[self.table_name] + prefetch_block = pserver_program.create_block(optimize_block.idx) + trainer_ids = self.prefetch_input_vars[pserver_index] + pserver_ids = pserver_program.global_block().create_var( + name=trainer_ids.name, + type=trainer_ids.type, + shape=trainer_ids.shape, + dtype=trainer_ids.dtype) + trainer_out = self.prefetch_output_vars[pserver_index] + pserver_out = pserver_program.global_block().create_var( + name=trainer_out.name, + type=trainer_out.type, + shape=trainer_out.shape, + dtype=trainer_out.dtype) + prefetch_block.append_op( + type=LOOKUP_TABLE_TYPE, + inputs={'Ids': pserver_ids, + "W": table_var}, + outputs={"Out": pserver_out}, + attrs={ + "is_sparse": True, # has no effect on lookup_table op + "is_distributed": True, + "padding_idx": -1 + }) + return prefetch_block + + def _create_table_optimize_block(self, pserver_index, pserver_program, + append_block): + def _clone_var(block, var, persistable=True): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + persistable=persistable) + + # STEP: create table optimize block + # create table param and grad var in pserver program + param_var = _clone_var( + pserver_program.global_block(), + self.origin_program.global_block().vars[self.table_name]) + grad_var = _clone_var( + pserver_program.global_block(), + self.origin_program.global_block().vars[framework.grad_var_name( + self.table_name)], + persistable=False) + + # create grad vars in pserver program + table_grad_var = self.table_param_grad[1] + table_grad_list = [ + pserver_program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, index, pserver_index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) for index in range(self.trainer_num) + ] + + # create table optimize block in pserver program + table_opt_op = [ + op for op in self.optimize_ops + if op.input("Param")[0] == self.table_name + ][0] + table_opt_block = pserver_program.create_block(append_block.idx) + # only support sgd now + assert table_opt_op.type == "sgd" + + # append sum op for table_grad_list + table_opt_block.append_op( + type="sum", + inputs={"X": table_grad_list}, + outputs={"Out": [grad_var]}) + + lr_var = pserver_program.global_block().vars[table_opt_op.input( + "LearningRate")[0]] + inputs = { + "Param": [param_var], + "Grad": [grad_var], + "LearningRate": [lr_var] + } + outputs = {"ParamOut": [param_var]} + table_opt_block.append_op( + type=table_opt_op.type, + inputs=inputs, + outputs=outputs, + attrs=table_opt_op.attrs) + # ====================== private transpiler functions ===================== def _create_vars_from_blocklist(self, program, @@ -512,7 +778,17 @@ class DistributeTranspiler: program.global_block().sync_with_cpp() return var_mapping - def _clone_var(self, block, var): + def create_splited_vars(self, source_var, block, tag): + return [ + block.create_var( + name=str(source_var.name + tag + str(index)), + type=source_var.type, + shape=source_var.shape, + dtype=source_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + + def _clone_var(self, block, var, persistable=True): assert isinstance(var, Variable) return block.create_var( name=var.name, @@ -520,12 +796,12 @@ class DistributeTranspiler: dtype=var.dtype, type=var.type, lod_level=var.lod_level, - persistable=True) + persistable=persistable) def _append_split_op(self, program, gradblocks): # Split variables that need to be split and append respective ops add_suffix = False - if self.trainers > 1: + if self.trainer_num > 1: add_suffix = True var_mapping = self._create_vars_from_blocklist( program, gradblocks, add_trainer_suffix=add_suffix) @@ -616,9 +892,9 @@ class DistributeTranspiler: return merged_var = \ pserver_block.vars[self._orig_varname(grad_block.name)] - if self.trainers > 1: + if self.trainer_num > 1: vars2merge = [] - for i in xrange(self.trainers): + for i in xrange(self.trainer_num): per_trainer_name = "%s.trainer_%d" % \ (self._orig_varname(grad_block.name), i) vars2merge.append(pserver_block.vars[per_trainer_name]) @@ -633,7 +909,7 @@ class DistributeTranspiler: type="scale", inputs={"X": merged_var}, outputs={"Out": merged_var}, - attrs={"scale": 1.0 / float(self.trainers)}) + attrs={"scale": 1.0 / float(self.trainer_num)}) new_inputs[key] = merged_var elif key == "Param": # param is already created on global program @@ -669,7 +945,7 @@ class DistributeTranspiler: new_shape = None if key in ["Param", "Grad", "LearningRate"]: continue - var = self.program.global_block().vars[opt_op.input(key)[0]] + var = self.origin_program.global_block().vars[opt_op.input(key)[0]] # update accumulator variable shape param_shape = new_inputs["Param"].shape new_shape = self._get_optimizer_input_shape(opt_op.type, key, @@ -682,8 +958,8 @@ class DistributeTranspiler: new_inputs[key] = tmpvar # change output's ParamOut variable - outputs = self._get_output_map_from_op(self.program.global_block().vars, - opt_op) + outputs = self._get_output_map_from_op( + self.origin_program.global_block().vars, opt_op) outputs["ParamOut"] = new_inputs["Param"] optimize_block.append_op( @@ -695,8 +971,8 @@ class DistributeTranspiler: def _append_pserver_non_opt_ops(self, optimize_block, opt_op): program = optimize_block.program # Append the ops for parameters that do not need to be optimized/updated - inputs = self._get_input_map_from_op(self.program.global_block().vars, - opt_op) + inputs = self._get_input_map_from_op( + self.origin_program.global_block().vars, opt_op) for varlist in inputs.itervalues(): if not isinstance(varlist, list): varlist = [varlist] @@ -709,8 +985,8 @@ class DistributeTranspiler: dtype=var.dtype, shape=var.shape) - outputs = self._get_output_map_from_op(self.program.global_block().vars, - opt_op) + outputs = self._get_output_map_from_op( + self.origin_program.global_block().vars, opt_op) for varlist in outputs.itervalues(): if not isinstance(varlist, list): @@ -783,7 +1059,6 @@ class DistributeTranspiler: if same_or_split_var(n, param) and n != param: return True return False - return False def _get_input_map_from_op(self, varmap, op): """Returns a dict from op input name to the vars in varmap.""" @@ -821,7 +1096,7 @@ class DistributeTranspiler: find_ops = [] # find ops which output is lr var - block = self.program.global_block() + block = self.origin_program.global_block() for op in block.ops: if set(op.output_arg_names) & lr_vars: find_ops.append(op) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ea9abdcae2e11290b1f90f6008723815da523aa7..4b841ef31dcb67ab660475cf6e231fd8a4ae83d6 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1183,6 +1183,8 @@ class Parameter(Variable): self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None) + self.do_model_average = kwargs.get('do_model_average', None) + def __str__(self): return self.to_string(True) @@ -1203,7 +1205,7 @@ class Parameter(Variable): if with_details: res_str = Variable.to_string(self, throw_on_error, True) additional_attr = ("trainable", "optimize_attr", "regularizer", - "gradient_clip_attr") + "gradient_clip_attr", "do_model_average") for attr_name in additional_attr: res_str += "%s: %s\n" % (attr_name, str(getattr(self, attr_name))) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d2e7d58524bfb11627b6acb36ef873c41b348f0f..5c2c2dd7abebf8960d68b4c4dfd746a4e27acd03 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -218,6 +218,7 @@ def fc(input, def embedding(input, size, is_sparse=False, + is_distributed=False, padding_idx=None, param_attr=None, dtype='float32'): @@ -268,8 +269,11 @@ def embedding(input, inputs={'Ids': input, 'W': w}, outputs={'Out': tmp}, - attrs={'is_sparse': is_sparse, - 'padding_idx': padding_idx}) + attrs={ + 'is_sparse': is_sparse, + 'is_distributed': is_distributed, + 'padding_idx': padding_idx + }) return tmp @@ -1516,7 +1520,8 @@ def batch_norm(input, in_place=False, name=None, moving_mean_name=None, - moving_variance_name=None): + moving_variance_name=None, + do_model_average_for_mean_and_var=False): """ This function helps create an operator to implement the BatchNorm layer using the configurations from the input parameters. @@ -1547,7 +1552,10 @@ def batch_norm(input, mean = helper.create_parameter( attr=ParamAttr( - name=moving_mean_name, initializer=Constant(0.0), trainable=False), + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=input.dtype) mean.stop_gradient = True @@ -1556,7 +1564,8 @@ def batch_norm(input, attr=ParamAttr( name=moving_variance_name, initializer=Constant(1.0), - trainable=False), + trainable=False, + do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=input.dtype) variance.stop_gradient = True @@ -3374,14 +3383,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): Here are some examples to explain it. 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - is [6, 8], the reshape operator will transform x into a 2-D tensor with + is [6, 8], the reshape operator will transform x into a 2-D tensor with shape [6, 8] and leaving x's data unchanged. 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape specified is [2, 3, -1, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this - case, one dimension of the target shape is set to -1, the value of this - dimension is inferred from the total element number of x and remaining + case, one dimension of the target shape is set to -1, the value of this + dimension is inferred from the total element number of x and remaining dimensions. 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape @@ -3615,7 +3624,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): def pad(x, paddings, pad_value=0., name=None): """ Pads a tensor with a constant value given by :attr:`pad_value`, and the - padded width is specified by :attr:`paddings`. + padded width is specified by :attr:`paddings`. Specifically, the number of values padded before the contents of :attr:`x` in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number @@ -3643,7 +3652,7 @@ def pad(x, paddings, pad_value=0., name=None): x (Variable): The input tensor variable. paddings (list): A list of integers. Its elements specify the padded width before and after for each dimension in turn. - The length of :attr:paddings must be + The length of :attr:paddings must be :math:`rank(x) \\times 2`. pad_value (float): The constant value used to pad. name(str|None): A name for this layer(optional). If set None, the layer diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 180575c35dc6e115e11cccf9fff9fb2d3cd7e9a6..36503cac6d5391821b977d90e6b77c4df7e3b564 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import re from collections import defaultdict from paddle.fluid.framework import Program import framework @@ -818,8 +818,8 @@ class ModelAverage(Optimizer): min_average_window, max_average_window and current update times. Args: - params_grads: A list of parameter-grad variable pairs. average_window_rate: The rate of average window. + params_grads: A list of parameter-grad variable pairs. min_average_window: The minimum size of average window. max_average_window: The maximum size of average window. @@ -840,8 +840,8 @@ class ModelAverage(Optimizer): """ def __init__(self, - params_grads, average_window_rate, + params_grads=None, min_average_window=10000, max_average_window=10000, **kwargs): @@ -849,24 +849,37 @@ class ModelAverage(Optimizer): self.average_window = average_window_rate self.min_average_window = min_average_window self.max_average_window = max_average_window - self.params_grads = params_grads + + self.params_grads = [] if params_grads is None else params_grads + params = {} + for param, grad in self.params_grads: + if param.do_model_average != False: + params[param.name] = (param, grad) + for param in framework.default_main_program().global_block( + ).all_parameters(): + if param.name not in params and param.do_model_average != False: + grad = param.block.create_var( + name=unique_name.generate(".".join([param.name, 'tmp'])), + dtype=param.dtype, + persistable=False, + stop_gradient=True) + params[param.name] = (param, grad) + self.params_grads = params.values() + for param, grad in self.params_grads: - if grad is not None: - self._append_average_accumulate_op(param) + self._append_average_accumulate_op(param) self.apply_program = Program() block = self.apply_program.global_block() with program_guard(main_program=self.apply_program): for param_grad in self.params_grads: - if param_grad[1] is not None: - self._add_average_apply_op(block, param_grad) + self._add_average_apply_op(block, param_grad) self.restore_program = Program() block = self.restore_program.global_block() with program_guard(main_program=self.restore_program): for param_grad in self.params_grads: - if param_grad[1] is not None: - self._add_average_restore_op(block, param_grad) + self._add_average_restore_op(block, param_grad) def _add_average_apply_op(self, block, param_grad): param = block.clone_variable(param_grad[0]) diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 255cd2104325afa31449cbd3875499a7c5d7f572..1c6970441bccdc1c1221503256c30c83502bd123 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -28,13 +28,15 @@ class ParamAttr(object): learning_rate=1.0, regularizer=None, trainable=True, - gradient_clip=None): + gradient_clip=None, + do_model_average=None): self.name = name self.initializer = initializer self.learning_rate = learning_rate self.regularizer = regularizer self.trainable = trainable self.gradient_clip = gradient_clip + self.model_average = do_model_average def set_default_initializer(self, initializer): if initializer is None: @@ -80,7 +82,8 @@ class ParamAttr(object): }, 'regularizer': self.regularizer, 'trainable': self.trainable, - 'gradient_clip_attr': self.gradient_clip + 'gradient_clip_attr': self.gradient_clip, + 'model_average': self.model_average } if with_initializer: kwargs['initializer'] = self.initializer @@ -90,7 +93,7 @@ class ParamAttr(object): class WeightNormParamAttr(ParamAttr): """ Used for weight normalization. Any field in ParamAttr can also be set here. - Besides, an extra field dim can be set to indicate the dimension except + Besides, an extra field dim can be set to indicate the dimension except which to normalize. """ # List to record the parameters reparameterized by weight normalization. diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index c0a6df831acbfe2654a5941cf95c91343992ef13..4d8bca4d2430a248ccf421572bdafdffc3a3003a 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -37,7 +37,7 @@ depth = 8 mix_hidden_lr = 1e-3 IS_SPARSE = True -PASS_NUM = 10 +PASS_NUM = 100 BATCH_SIZE = 10 embedding_name = 'emb' @@ -77,7 +77,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -94,8 +95,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim) + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') ]) lstm = fluid.layers.dynamic_lstm( @@ -109,8 +110,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, input_tmp = [mix_hidden, lstm] feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len) + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') ]) return feature_out @@ -171,7 +172,7 @@ def train(use_cuda, save_dirname=None, is_local=True): # check other optimizers and check why out will be NAN sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( - learning_rate=0.0001, + learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True)) @@ -233,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True): print("second per batch: " + str((time.time( ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test - if float(pass_precision) > 0.05: + if float(pass_precision) > 0.01: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index e4997b4069f60ff4382b4254bc026ae8ae29b345..5ec6890c1b0dabd2804a92071b63c9610299e67c 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -157,7 +157,6 @@ def train(nn_type, for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - pserver_endpoints = os.getenv("PSERVERS") trainers = int(os.getenv("TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f10ef9b63412ecf74471f4fb94eb91ac72d5f8f9..3bd24c98a22b5db9833a312f481ed74c3d26f0ad 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1,6 +1,12 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +# The fully connected test is removed whe the WITH_MKLDNN flag is OFF +# Because the fully connected layer has only one kernel (MKLDNN) +if(NOT WITH_MKLDNN) + list(REMOVE_ITEM TEST_OPS test_fc_op) +endif(NOT WITH_MKLDNN) + if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) endif(NOT WITH_DISTRIBUTE)