Commit b858c103, authored by weixing02

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into data_reader

 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG UBUNTU_MIRROR
......
@@ -62,29 +62,33 @@ endif()
 ## Then find the reference-cblas. www.netlib.org/blas/
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
-set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/include
-  /usr/include
-  /usr/include/cblas
-)
-
-set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/lib
-  /usr/lib
-  /usr/lib/blas/reference/
-  /usr/lib/reference/
-)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
+    ${REFERENCE_CBLAS_ROOT}/include
+    /usr/include
+    /usr/include/cblas
+  )
+
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
+    ${REFERENCE_CBLAS_ROOT}/lib
+    /usr/lib
+    /usr/lib/blas/reference/
+    /usr/lib/reference/
+  )
+else()
+  # Disable the finding of reference cblas under host's system path
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
+endif()

 find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
   ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
 find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
   ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})

-if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
   set(CBLAS_FOUND ON)
   set(CBLAS_PROVIDER REFERENCE)
   set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
......
@@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
 ENDIF()
 ExternalProject_Add(
   extern_grpc
   DEPENDS protobuf zlib
   GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-  GIT_TAG "v1.8.x"
+  GIT_TAG "v1.11.x"
   PREFIX ${GRPC_SOURCES_DIR}
   UPDATE_COMMAND ""
   CONFIGURE_COMMAND ""
......
@@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#

-IF(MOBILE_INFERENCE)
+if(MOBILE_INFERENCE OR RPI)
   return()
-ENDIF()
+endif()

 include (ExternalProject)

 # NOTE: snappy is needed when linking with recordio

-SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
-SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+
+set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")

 ExternalProject_Add(
   extern_snappy
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )

 add_library(snappy STATIC IMPORTED GLOBAL)
-set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
-             "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})

 include_directories(${SNAPPY_INCLUDE_DIR})

 add_dependencies(snappy extern_snappy)
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#

-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR RPI)
   return()
 ENDIF()
@@ -21,9 +20,11 @@ include (ExternalProject)
 # NOTE: snappy is needed when linking with recordio

-SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
-SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
-SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
+
+set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")

 ExternalProject_Add(
   extern_snappystream
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )

 add_library(snappystream STATIC IMPORTED GLOBAL)
-set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
-             "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})

 include_directories(${SNAPPYSTREAM_INCLUDE_DIR})  # For snappystream to include its own headers.
 include_directories(${THIRD_PARTY_PATH}/install)  # For Paddle to include snappy stream headers.
......
@@ -195,14 +195,7 @@ function(cc_library TARGET_NAME)
       list(REMOVE_ITEM cc_library_DEPS warpctc)
       add_dependencies(${TARGET_NAME} warpctc)
     endif()
-    if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
-      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
-      # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
-      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
-    else()
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-    endif()
+    target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
   endif()
@@ -243,11 +236,7 @@ function(cc_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
-      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
-    endif()
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
......
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # find all fluid modules is used for paddle fluid static library
 function(find_fluid_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
   string(FIND "${__target_path}" "fluid" pos)
   if(pos GREATER 1)
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -77,6 +92,23 @@ elseif (WITH_MKLML)
     )
 endif()

+if(NOT MOBILE_INFERENCE AND NOT RPI)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  copy(snappy_lib
+    SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  copy(snappystream_lib
+    SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  copy(zlib_lib
+    SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+endif()
+
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
......
@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad
 From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.

-We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.

 ## Turing Completeness
......
@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
 endif()

 add_subdirectory(testing)
-if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+if(NOT MOBILE_INFERENCE AND NOT RPI)
   add_subdirectory(fluid)
 endif()
@@ -3,6 +3,7 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
-add_subdirectory(inference)
 add_subdirectory(string)
 add_subdirectory(recordio)
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
@@ -92,7 +92,7 @@ class BlockDesc {
   /*
    * Remove Op and its input/output variables.
-   * Note that for either input or ouput variable, if it is also an input or
+   * Note that for either input or output variable, if it is also an input or
    * output variable of other ops, we should remain it.
    */
   void RemoveOp(size_t s, size_t e);
......
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"

+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
     }
   }

-  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
 }

 std::string ComputationOpHandle::Name() const { return op_->Type(); }
......
@@ -14,6 +14,9 @@
 #include "paddle/fluid/framework/details/fetch_op_handle.h"

+#include <string>
+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    auto &t = scope->FindVar(kLocalExecScopeName)
+                  ->Get<Scope *>()
+                  ->FindVar(var_name)
+                  ->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
......
@@ -24,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace details {

+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+
 class OpHandleBase {
  private:
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);
......
@@ -15,13 +15,15 @@
 #pragma once

 #include <memory>
+#include <string>
+#include <vector>

 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"

 namespace paddle {
 namespace framework {
 namespace details {

 class SSAGraphExecutor {
   DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
......
@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     ready_ops.clear();
   };

-  // Create local scopes.
-  for (auto &scope : local_scopes_) {
-    auto &local_scope = scope->NewScope();
-    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
-  }
-
   // Step 3. Execution
   while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
     // 1. Run All Ready ops
@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   PADDLE_ENFORCE(ready_ops.empty());
   PADDLE_ENFORCE(delayed_ops.empty());
   PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
-  ++computation_count_;
-
-  auto sync_computation = [&] {
-    computation_count_ = 0;
-    // Wait All computational streams
-    for (auto p : this->places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
-    for (auto &scope : local_scopes_) {
-      scope->DropKids();
-    }
-  };

   // Wait FetchOps.
   if (!fetch_ops.empty()) {
     fetch_ops.clear();
-    sync_computation();
-  }
-
-  if (computation_count_ == max_async_computation) {
-    sync_computation();
-  }
-
-  // NOTE: the temp scope can be dropped lazily if needed.
-  // Drop tmp scopes;
-  for (auto &scope : local_scopes_) {
-    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
-    kid = nullptr;
   }

   return fetch_data;
......
@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
   bool allow_op_delay_;
-
-  size_t computation_count_{0};
-  size_t max_async_computation{100};
 };

 }  // namespace details
......
@@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name,
   if (tensor.memory_size() == 0) {
     return;
   }
-  if (tensor.type().hash_code() != typeid(float).hash_code() &&
-      tensor.type().hash_code() != typeid(double).hash_code()) {
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&   // NOLINT
+      tensor.type().hash_code() != typeid(double).hash_code()) {  // NOLINT
     return;
   }
   PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
@@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 // Return true if the block has feed operators and holder of matching info.
 static bool has_feed_operators(
     const BlockDesc& block,
-    std::map<std::string, const LoDTensor*>& feed_targets,
+    const std::map<std::string, const LoDTensor*>& feed_targets,
     const std::string& feed_holder_name) {
   size_t feed_count = 0;
   for (auto* op : block.AllOps()) {
     if (op->Type() == kFeedOpType) {
       feed_count++;
+      // The input variable's name of feed_op should be feed_holder_name.
       PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
                         "Input to feed op should be '%s'", feed_holder_name);
       std::string feed_target_name = op->Output("Out")[0];
@@ -166,13 +167,15 @@ static bool has_feed_operators(
         feed_count, feed_targets.size(),
         "The number of feed operators should match 'feed_targets'");

-    // When feed operator are present, so should be feed_holder
-    auto var = block.FindVar(feed_holder_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                            feed_holder_name);
-    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
-                      "'%s' variable should be 'FEED_MINIBATCH' type",
-                      feed_holder_name);
+    if (!feed_holder_name.empty()) {
+      // When feed operator are present, so should be feed_holder.
+      auto var = block.FindVar(feed_holder_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                              feed_holder_name);
+      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
+                        "'%s' variable should be 'FEED_MINIBATCH' type",
+                        feed_holder_name);
+    }
   }

   return feed_count > 0;
@@ -185,12 +188,14 @@ static bool has_feed_operators(
 // and fetch_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
-    const BlockDesc& block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const BlockDesc& block,
+    const std::map<std::string, LoDTensor*>& fetch_targets,
     const std::string& fetch_holder_name) {
   size_t fetch_count = 0;
   for (auto* op : block.AllOps()) {
     if (op->Type() == kFetchOpType) {
       fetch_count++;
+      // The output variable's name of fetch_op should be fetch_holder_name.
       PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
                         "Output of fetch op should be '%s'", fetch_holder_name);
       std::string fetch_target_name = op->Input("X")[0];
@@ -206,13 +211,15 @@ static bool has_fetch_operators(
         fetch_count, fetch_targets.size(),
         "The number of fetch operators should match 'fetch_targets'");

-    // When fetch operator are present, so should be fetch_holder
-    auto var = block.FindVar(fetch_holder_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                            fetch_holder_name);
-    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
-                      "'%s' variable should be 'FETCH_LIST' type",
-                      fetch_holder_name);
+    if (!fetch_holder_name.empty()) {
+      // When fetch operator are present, so should be fetch_holder.
+      auto var = block.FindVar(fetch_holder_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                              fetch_holder_name);
+      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
+                        "'%s' variable should be 'FETCH_LIST' type",
+                        fetch_holder_name);
+    }
   }

   return fetch_count > 0;
@@ -259,16 +266,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }

-  // map the data of feed_targets to feed_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      std::string feed_target_name = op->Output("Out")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
-                      idx);
-    }
-  }
-
   if (!has_fetch_ops) {
     // create fetch_holder variable
     auto* fetch_holder = global_block->Var(fetch_holder_name);
@@ -292,17 +289,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }

-  Run(*copy_program, scope, 0, create_vars, create_vars);
-
-  // obtain the data of fetch_targets from fetch_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      std::string fetch_target_name = op->Input("X")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      *fetch_targets[fetch_target_name] =
-          GetFetchVariable(*scope, fetch_holder_name, idx);
-    }
-  }
+  auto ctx = Prepare(*copy_program, 0);
+  RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars,
+                     feed_holder_name, fetch_holder_name);
 }

 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
@@ -370,5 +359,42 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 }

+void Executor::RunPreparedContext(
+    ExecutorPrepareContext* ctx, Scope* scope,
+    std::map<std::string, const LoDTensor*>& feed_targets,
+    std::map<std::string, LoDTensor*>& fetch_targets, bool create_vars,
+    const std::string& feed_holder_name, const std::string& fetch_holder_name) {
+  auto& global_block = ctx->prog_.Block(ctx->block_id_);
+
+  PADDLE_ENFORCE(
+      has_feed_operators(global_block, feed_targets, feed_holder_name),
+      "Program in ExecutorPrepareContext should has feed_ops.");
+  PADDLE_ENFORCE(
+      has_fetch_operators(global_block, fetch_targets, fetch_holder_name),
+      "Program in the prepared context should has fetch_ops.");
+
+  // map the data of feed_targets to feed_holder
+  for (auto* op : global_block.AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      std::string feed_target_name = op->Output("Out")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
+                      idx);
+    }
+  }
+
+  RunPreparedContext(ctx, scope, create_vars, create_vars);
+
+  // obtain the data of fetch_targets from fetch_holder
+  for (auto* op : global_block.AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      std::string fetch_target_name = op->Input("X")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      *fetch_targets[fetch_target_name] =
+          GetFetchVariable(*scope, fetch_holder_name, idx);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
@@ -14,6 +14,9 @@ limitations under the License. */
 #pragma once

+#include <map>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -70,6 +73,13 @@ class Executor {
                           bool create_local_scope = true,
                           bool create_vars = true);

+  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
+                          std::map<std::string, const LoDTensor*>& feed_targets,
+                          std::map<std::string, LoDTensor*>& fetch_targets,
+                          bool create_vars = true,
+                          const std::string& feed_holder_name = "feed",
+                          const std::string& fetch_holder_name = "fetch");
+
  private:
   const platform::Place place_;
 };
......
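The new `RunPreparedContext` overload lets callers pay the program-analysis cost of `Prepare` once and re-run the same prepared program with fresh feed data. A minimal usage sketch, with hypothetical function and variable names; it assumes a `ProgramDesc` that already contains feed/fetch ops, as the enforce checks in the new executor.cc overload require:

#include <map>
#include <string>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"

void RunTwice(const paddle::framework::ProgramDesc& program,
              paddle::framework::Scope* scope,
              const paddle::platform::Place& place,
              std::map<std::string, const paddle::framework::LoDTensor*>& feeds,
              std::map<std::string, paddle::framework::LoDTensor*>& fetches) {
  paddle::framework::Executor executor(place);

  // Prepare analyzes block 0 once and caches the created operators.
  auto ctx = executor.Prepare(program, 0);

  // Re-use the prepared context across runs; only the feed data changes,
  // so the per-run cost drops to executing the cached op list.
  executor.RunPreparedContext(ctx.get(), scope, feeds, fetches);
  executor.RunPreparedContext(ctx.get(), scope, feeds, fetches);
}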
@@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
   }
 }

-static DDim GetDims(const Scope& scope, const std::string& name) {
+static DDim GetDims(const Scope& scope, const std::string& name,
+                    bool get_actual_dim = false) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return DDim({-1});
@@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>().dims();
   } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
+    if (get_actual_dim) {
+      return var->Get<SelectedRows>().value().dims();
+    } else {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    }
   } else {
     return DDim({-1});
   }
@@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
       if (i != input.second.size() - 1) {
@@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
       if (i != output.second.size() - 1) {
......
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"

 #include <string>
+#include <tuple>
 #include <vector>

 #ifdef PADDLE_WITH_CUDA
@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+
+  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
 };

 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
       allow_op_delay));

   // Step 3. Create vars in each scope;
-  for (auto *scope : member_->local_scopes_) {
-    for (auto *var : main_program.Block(0).AllVars()) {
-      if (scope->FindVar(var->Name()) != nullptr) {
-        continue;
-      }
-      InitializeVariable(scope->Var(var->Name()), var->GetType());
-    }
+  for (auto *var : main_program.Block(0).AllVars()) {
+    member_->var_types_.emplace_back(var->Name(), var->GetType(),
+                                     var->Persistable());
   }
 }
@@ -163,9 +161,42 @@ void ParallelExecutor::Run(
     const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
   platform::RecordBlock b(0);
   SplitTensorToPlaces(feed_tensors);
+
+  // Create local scopes.
+  for (auto &scope : member_->local_scopes_) {
+    Scope &local_scope = scope->NewScope();
+    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+        &local_scope;
+    for (auto &name_type_pair : member_->var_types_) {
+      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
+        continue;
+      }
+
+      if (std::get<2>(name_type_pair)) {  // Persistable
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      } else {
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      }
+    }
+  }
+
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
+
+  // Wait All computational streams
+  for (auto p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  for (auto &scope : member_->local_scopes_) {
+    auto &local_scope =
+        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+    scope->DeleteScope(local_scope);
+    local_scope = nullptr;
+  }
 }

 void ParallelExecutor::SplitTensorToPlaces(
......
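For reference, the scope lifecycle that this change moves out of the graph executor and into `ParallelExecutor::Run` can be distilled as follows. A minimal sketch with a hypothetical helper function, using only the `Scope` and `kLocalExecScopeName` APIs that appear in the diff above:

#include "paddle/fluid/framework/details/op_handle_base.h"  // kLocalExecScopeName
#include "paddle/fluid/framework/scope.h"

namespace fw = paddle::framework;

// Hypothetical helper mirroring one ParallelExecutor::Run iteration.
void RunOnce(fw::Scope *scope) {
  // 1. Create the per-run child scope and publish its address in the parent
  //    scope under the well-known name, so ComputationOpHandle and
  //    FetchOpHandle can look it up while the graph executes.
  fw::Scope &local = scope->NewScope();
  *scope->Var(fw::details::kLocalExecScopeName)->GetMutable<fw::Scope *>() =
      &local;

  // 2. ... run the SSA graph; ops resolve variables through the local scope ...

  // 3. After the device streams are synchronized, tear the child scope down.
  auto &local_scope =
      *scope->Var(fw::details::kLocalExecScopeName)->GetMutable<fw::Scope *>();
  scope->DeleteScope(local_scope);
  local_scope = nullptr;
}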
@@ -14,8 +14,12 @@
 #include "paddle/fluid/framework/threadpool.h"

+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"

+DEFINE_int32(io_threadpool_size, 100,
+             "number of threads used for doing IO, default 100");
+
 namespace paddle {
 namespace framework {
@@ -91,5 +95,20 @@ void ThreadPool::TaskLoop() {
   }
 }

+std::unique_ptr<ThreadPool> ThreadPoolIO::io_threadpool_(nullptr);
+std::once_flag ThreadPoolIO::io_init_flag_;
+
+ThreadPool* ThreadPoolIO::GetInstanceIO() {
+  std::call_once(io_init_flag_, &ThreadPoolIO::InitIO);
+  return io_threadpool_.get();
+}
+
+void ThreadPoolIO::InitIO() {
+  if (io_threadpool_.get() == nullptr) {
+    // TODO(typhoonzero1986): make this configurable
+    io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size));
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
@@ -14,12 +14,12 @@ limitations under the License. */
 #pragma once

-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <functional>
-#include <future>
-#include <mutex>
+#include <future>  // NOLINT
+#include <mutex>   // NOLINT
 #include <queue>
-#include <thread>
+#include <thread>  // NOLINT
 #include <vector>

 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -28,6 +28,22 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+struct ExceptionHandler {
+  mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+  explicit ExceptionHandler(
+      std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+      : future_(std::move(f)) {}
+  void operator()() const {
+    auto ex = this->future_.get();
+    if (ex != nullptr) {
+      LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+                    "should use RunAndGetException to handle the exception.\n"
+                    "The default exception handler is LOG(FATAL)."
+                 << ex->what();
+    }
+  }
+};
+
 // ThreadPool maintains a queue of tasks, and runs them using a fixed
 // number of threads.
 class ThreadPool {
@@ -87,22 +103,6 @@ class ThreadPool {
   void Wait();

  private:
-  struct ExceptionHandler {
-    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
-    explicit ExceptionHandler(
-        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
-        : future_(std::move(f)) {}
-    void operator()() const {
-      auto ex = this->future_.get();
-      if (ex != nullptr) {
-        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
-                      "should use RunAndGetException to handle the exception.\n"
-                      "The default exception handler is LOG(FATAL)."
-                   << ex->what();
-      }
-    }
-  };
-
   DISABLE_COPY_AND_ASSIGN(ThreadPool);

   // If the task queue is empty and avaialbe is equal to the number of
@@ -135,6 +135,17 @@ class ThreadPool {
   std::condition_variable completed_;
 };

+class ThreadPoolIO : ThreadPool {
+ public:
+  static ThreadPool* GetInstanceIO();
+  static void InitIO();
+
+ private:
+  // NOTE: threadpool in base will be inhereted here.
+  static std::unique_ptr<ThreadPool> io_threadpool_;
+  static std::once_flag io_init_flag_;
+};
+
 // Run a function asynchronously.
 // NOTE: The function must return void. If the function need to return a value,
 // you can use lambda to capture a value pointer.
@@ -143,5 +154,10 @@ std::future<void> Async(Callback callback) {
   return ThreadPool::GetInstance()->Run(callback);
 }

+template <typename Callback>
+std::future<void> AsyncIO(Callback callback) {
+  return ThreadPoolIO::GetInstanceIO()->Run(callback);
+}
+
 }  // namespace framework
 }  // namespace paddle
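As the NOTE above states, tasks submitted through `Async` (and the new `AsyncIO`) must return void, so a result is returned by capturing a pointer to an output slot. A small self-contained sketch of that idiom against the `Async` signature shown above; the include path is taken from this diff and the computation is purely illustrative:

#include <iostream>

#include "paddle/fluid/framework/threadpool.h"

int main() {
  int result = 0;
  int* out = &result;  // output slot captured by the void-returning task

  // Async dispatches the callback to the shared pool and returns a
  // std::future<void>; the value travels back through the captured pointer.
  std::future<void> done = paddle::framework::Async([out] { *out = 6 * 7; });
  done.wait();  // block until the pooled thread has run the task

  std::cout << result << std::endl;  // prints 42
  return 0;
}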
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)

 cc_library(paddle_fluid_api
     SRCS io.cc
@@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules})
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
-    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+    DEPS ${fluid_modules})
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
......
@@ -17,10 +17,16 @@ limitations under the License. */
 #include <fstream>

 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/pybind/pybind.h"

 namespace paddle {
 namespace inference {

+// Temporarily add this function for exposing framework::InitDevices() when
+// linking the inference shared library.
+void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
+
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
......
@@ -18,12 +18,15 @@ limitations under the License. */
 #include <string>
 #include <vector>

 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"

 namespace paddle {
 namespace inference {

+void Init(bool init_p2p);
+
 void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
......
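A short usage sketch for the new entry point: a binary that links the inference shared library should call `Init` once before creating any executor. Only `Init(bool)` itself is declared above; the surrounding steps in the comments are illustrative:

#include "paddle/fluid/inference/io.h"

int main() {
  // Initialize devices exactly once, before any executor is created.
  // init_p2p only matters for multi-GPU peer-to-peer access, so a
  // CPU-only or single-GPU binary can pass false.
  paddle::inference::Init(false /* init_p2p */);

  // ... then load the program and parameters, e.g. via LoadPersistables()
  // declared above, and run them with a framework::Executor ...
  return 0;
}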
@@ -17,7 +17,7 @@ function(inference_test TARGET_NAME)
       string(REGEX REPLACE "^_$" "" arg "${arg}")
       cc_test(test_inference_${TARGET_NAME}${arg}
               SRCS test_inference_${TARGET_NAME}.cc
-              DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+              DEPS paddle_fluid
               ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
       set_tests_properties(test_inference_${TARGET_NAME}${arg}
                            PROPERTIES DEPENDS test_${TARGET_NAME})
......
@@ -46,8 +46,8 @@ TEST(inference, image_classification) {
   // Run inference on CPU
   LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds,
-                                                   cpu_fetchs1, FLAGS_repeat);
+  TestInference<paddle::platform::CPUPlace, false, true>(
+      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
   LOG(INFO) << output1.dims();

 #ifdef PADDLE_WITH_CUDA
@@ -57,8 +57,8 @@ TEST(inference, image_classification) {
   // Run inference on CUDA GPU
   LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds,
-                                                    cpu_fetchs2, FLAGS_repeat);
+  TestInference<paddle::platform::CUDAPlace, false, true>(
+      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
   LOG(INFO) << output2.dims();

   CheckError<float>(output1, output2);
......
@@ -89,7 +89,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }

-template <typename Place, bool CreateVars = true>
+template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
@@ -175,8 +175,15 @@ void TestInference(const std::string& dirname,
     }

     // Ignore the profiling results of the first run
-    executor.Run(*inference_program, scope, feed_targets, fetch_targets,
-                 CreateVars);
+    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+    if (PrepareContext) {
+      ctx = executor.Prepare(*inference_program, 0);
+      executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets,
+                                  CreateVars);
+    } else {
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                   CreateVars);
+    }

     // Enable the profiler
     paddle::platform::EnableProfiler(state);
@@ -187,8 +194,15 @@ void TestInference(const std::string& dirname,
           "run_inference",
           paddle::platform::DeviceContextPool::Instance().Get(place));

-      executor.Run(*inference_program, scope, feed_targets, fetch_targets,
-                   CreateVars);
+      if (PrepareContext) {
+        // Note: if you change the inference_program, you need to call
+        // executor.Prepare() again to get a new ExecutorPrepareContext.
+        executor.RunPreparedContext(ctx.get(), scope, feed_targets,
+                                    fetch_targets, CreateVars);
+      } else {
+        executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                     CreateVars);
+      }
     }

     // Disable the profiler and print the timing information
......
@@ -245,9 +245,17 @@ op_library(channel_send_op DEPS concurrency)
 op_library(channel_recv_op DEPS concurrency)

 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})

+# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
+# Because the fully connected layer has only one MKLDNN's operator
+if(NOT WITH_MKLDNN)
+  list(REMOVE_ITEM GENERAL_OPS fc_op)
+endif(NOT WITH_MKLDNN)
+
 foreach(src ${GENERAL_OPS})
   op_library(${src})
 endforeach()

 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")

 add_subdirectory(reader)
......
@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     const auto *bias = ctx.Input<Tensor>("Bias");

     auto *y = ctx.Output<Tensor>("Y");
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");

     // alloc memory
     y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-        functor;
-    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
-    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));

     auto handle = dev_ctx.cudnn_handle();
@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       // Run training mode.
       // obtain running mean and running inv var, and see if we need to
       // initialize them.
+      auto *mean_out = ctx.Output<Tensor>("MeanOut");
+      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+      mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+      auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+      saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+          functor;
+      functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+      functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
+
       double this_factor = 1. - momentum;

       CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/concat_op.h"
+
 #include <string>
 #include <vector>
@@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel {
     size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
     const size_t n = ins.size();

-    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
+    if (n == 1) {
+      VLOG(3) << "Warning: concat op have only one input, may waste memory";
+    }

     auto out_dims = ins[0];
     size_t in_zero_dims_size = out_dims.size();
......
@@ -35,7 +35,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);

-  framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
+  framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
+                      this] {
     auto* var = p_scope->FindVar(var_name_val);

     ::grpc::ByteBuffer req;
@@ -89,7 +90,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);

-  framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
+  framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch,
+                      this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -132,8 +134,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);

-  framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                    time_out, ch, this] {
+  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
+                      time_out, ch, this] {
     auto* var = p_scope->FindVar(in_var_name_val);

     ::grpc::ByteBuffer req;
@@ -196,7 +198,7 @@ bool RPCClient::Wait() {
   std::vector<std::future<void>> waits(req_count_);

   for (int i = 0; i < req_count_; i++) {
-    waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
+    waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); });
   }

   for (int i = 0; i < req_count_; i++) {
......
@@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase {

     ::grpc::ByteBuffer reply;
     std::string var_name = request_->OutVarname();
+    VLOG(3) << "prefetch var " << var_name;
     auto var_desc = program_->Block(0).FindVar(var_name);
     framework::Scope* local_scope = &scope_->NewScope();
     auto* var = local_scope->FindVar(var_name);
@@ -216,10 +217,10 @@ void AsyncGRPCServer::RunSyncUpdate() {
   std::function<void()> prefetch_register =
       std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);

+  // TODO(wuyi): Run these "HandleRequest" in thread pool
   t_send_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_send_.get(), "cq_send", send_register)));
-
   t_get_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_get_.get(), "cq_get", get_register)));
......
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <ostream>
-#include <thread>
+#include <thread>  // NOLINT
+#include <vector>

 #include "paddle/fluid/operators/listen_and_serv_op.h"
@@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   auto ins = Inputs("X");
   auto fan_in = Attr<int>("Fanin");

-  auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *program = block->Program();
+  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
+  auto *program = optimize_block->Program();
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
@@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   framework::Executor executor(dev_place);
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+    if (blkid != prefetch_block->ID()) {
+      block_list.push_back(blkid);
+    }
   }
-  auto prepared = executor.Prepare(*program, block_list);
+  auto optimize_prepared = executor.Prepare(*program, block_list);
   // Insert placeholder for block0 which holds current op itself.
-  prepared.insert(prepared.begin(),
-                  std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
+  optimize_prepared.insert(
+      optimize_prepared.begin(),
+      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));

   rpc_service_->SetScope(&recv_scope);
   rpc_service_->SetDevCtx(&dev_ctx);
   // TODO(qiao) set proper fields for table lookup and update
   rpc_service_->SetExecutor(&executor);
-  rpc_service_->SetPrefetchBlkdId(0);
+  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  rpc_service_->SetPrefetchBlkdId(prefetch_block->ID());
+  rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
+  prefetch_prepared.release();
   rpc_service_->SetProgram(program);
   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
@@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     parallel_blkids.push_back(1);
     double ts = detail::GetTimestamp();
     for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (program->Block(blkid).Parent() != last_parent_blkid) {
-        ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
-                              &recv_scope);
-        parallel_blkids.clear();
-        last_parent_blkid = program->Block(blkid).Parent();
+      if (blkid != prefetch_block->ID()) {
+        if (program->Block(blkid).Parent() != last_parent_blkid) {
+          ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
+                                program, &recv_scope);
+          parallel_blkids.clear();
+          last_parent_blkid = program->Block(blkid).Parent();
+        }
+        parallel_blkids.push_back(blkid);
       }
-      parallel_blkids.push_back(blkid);
     }
-    ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
-                          &recv_scope);
+    ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
+                          program, &recv_scope);
     VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";

     // Reset the received sparse variables, the sum operator would not
@@ -211,6 +222,8 @@ from send_op and send back variables to recv_op.
         .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
     AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                     "BlockID to run on server side.");
+    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
+                                    "prefetch block to run on server side.");
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
   }
......
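The scheduling rule in RunImpl above can be stated on its own: block 0 of the server program holds the listen_and_serv op itself, the prefetch block is kept out of the optimize list, and consecutive blocks that share a parent are batched so they can execute in parallel. A rough Python sketch of that grouping (function name and inputs are hypothetical, for illustration only):

    # parents[i] is the parent block id of block i; prefetch_id is skipped.
    def schedule_blocks(parents, prefetch_id):
        batches, current, last_parent = [], [], None
        for blkid in range(1, len(parents)):
            if blkid == prefetch_id:
                continue
            if last_parent is not None and parents[blkid] != last_parent:
                batches.append(current)
                current = []
            last_parent = parents[blkid]
            current.append(blkid)
        if current:
            batches.append(current)
        return batches

    print(schedule_blocks([-1, 0, 0, 1, 1, 0], prefetch_id=5))  # [[1, 2], [3, 4]]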
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <stdint.h>
 #include <ostream>
+#include <string>

 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -27,6 +28,7 @@ namespace paddle {
 namespace operators {

 constexpr char kOptimizeBlock[] = "OptimizeBlock";
+constexpr char kPrefetchBlock[] = "PrefetchBlock";

 void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
...
@@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
               "(boolean, default false) "
               "Sparse update.")
         .SetDefault(false);
+    AddAttr<bool>("is_distributed",
+                  "(boolean, default false) distributed lookup table.")
+        .SetDefault(false);
     AddAttr<int64_t>("padding_idx",
                      "(int64, default -1) "
                      "If the value is -1, it makes no effect to lookup. "
...
@@ -288,9 +288,14 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
   // TODO(kexinzhao): add processing code for compute capability < 53 case
   PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
                     "cublas Hgemm requires GPU compute capability >= 53");
+
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
       strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }

 template <>
@@ -310,9 +315,13 @@ void batched_gemm<platform::CUDADeviceContext, float>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;

+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }

 template <>
@@ -332,9 +341,13 @@ void batched_gemm<platform::CUDADeviceContext, double>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;

+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }

 template <>
...
@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once

+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
...
@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);

     auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
+        mkldnn::memory({src_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(input_data)));
     auto dst_memory =
-        mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
+        mkldnn::memory({dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(output_data)));

     auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
                                              *workspace_memory);
@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         pool_bwd_desc, mkldnn_engine, *pool_pd);

     auto diff_src_memory =
-        mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
+        mkldnn::memory({diff_src_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(in_x_grad_data)));
     auto diff_dst_memory =
-        mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
+        mkldnn::memory({diff_dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(out_grad_data)));

     auto bwd_prim = mkldnn::pooling_backward(
         pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
...
@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once

+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
...
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <future>
+#include <future>  // NOLINT
 #include <ostream>

 #include "paddle/fluid/framework/data_type.h"
@@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase {
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get "
-                << outs[i] << "back";
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+                << outs[i] << " back";
         rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
                                           outs[i]);
       } else {
@@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
              "(RPCClient) The RPC client object which will be"
              "initialized at most once.");
     AddOutput("Out",
-              "(SelectedRows) result "
+              "(LoDTensor) result "
              "to be fetched from parameter server")
         .AsDuplicable();
     AddAttr<std::vector<std::string>>(
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/prelu_op.h"
 #include <string>

 namespace paddle {
...
@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     bool flip = ctx->Attrs().Get<bool>("flip");

     std::vector<float> aspect_ratios_vec;
-    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
+    ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
     size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
     if (max_sizes.size() > 0) {
...
@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     auto clip = ctx.Attr<bool>("clip");

     std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);

     T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
     T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/transform.h"
@@ -22,23 +24,23 @@ namespace operators {

 inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                bool flip,
-                               std::vector<float>& output_aspect_ratior) {
+                               std::vector<float>* output_aspect_ratior) {
   constexpr float epsilon = 1e-6;
-  output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.0f);
+  output_aspect_ratior->clear();
+  output_aspect_ratior->push_back(1.0f);
   for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
     float ar = input_aspect_ratior[i];
     bool already_exist = false;
-    for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
-      if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
+    for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
+      if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
         already_exist = true;
         break;
       }
     }
     if (!already_exist) {
-      output_aspect_ratior.push_back(ar);
+      output_aspect_ratior->push_back(ar);
       if (flip) {
-        output_aspect_ratior.push_back(1.0f / ar);
+        output_aspect_ratior->push_back(1.0f / ar);
       }
     }
   }
@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     auto clip = ctx.Attr<bool>("clip");

     std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);

     T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
     T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
...
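Taken on its own, ExpandAspectRatios builds a deduplicated ratio list seeded with 1.0 and, when flip is set, also appends each ratio's reciprocal. A minimal Python sketch of the same rule (hypothetical helper, not part of the patch):

    def expand_aspect_ratios(input_ratios, flip, epsilon=1e-6):
        out = [1.0]
        for ar in input_ratios:
            if any(abs(ar - existing) < epsilon for existing in out):
                continue  # skip near-duplicates, as the epsilon test above does
            out.append(ar)
            if flip:
                out.append(1.0 / ar)
        return out

    print(expand_aspect_ratios([1.0, 2.0, 3.0], flip=True))
    # [1.0, 2.0, 0.5, 3.0, 0.333...]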
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/rank_loss_op.h"
+#include <string>

 namespace paddle {
 namespace operators {
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <future>  // NOLINT
 #include <ostream>

 #include "paddle/fluid/framework/data_type.h"
@@ -19,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"

-#include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"

 namespace paddle {
...
@@ -60,7 +60,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
   static framework::DDim ValidateShape(const std::vector<int> shape,
                                        const framework::DDim &in_dims) {
     const int64_t in_size = framework::product(in_dims);
-    // only one dimension canbe set to -1, whose size will be automatically
+    // only one dimension can be set to -1, whose size will be automatically
     // infered.
     const int64_t unk_dim_val = -1;
     const int64_t copy_dim_val = 0;
@@ -119,13 +119,15 @@ class ReshapeKernel : public framework::OpKernel<T> {
     auto *shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
     framework::DDim out_dims = out->dims();

     if (shape_tensor) {
       auto *shape_data = shape_tensor->data<int>();
+      framework::Tensor cpu_shape_tensor;
       if (platform::is_gpu_place(ctx.GetPlace())) {
-        framework::Tensor cpu_shape_tensor;
         TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(),
                    &cpu_shape_tensor);
         shape_data = cpu_shape_tensor.data<int>();
+        ctx.device_context().Wait();
       }
       auto shape =
           std::vector<int>(shape_data, shape_data + shape_tensor->numel());
...
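The convention ValidateShape enforces: a target dimension of -1 is inferred from the remaining element count, and 0 copies the corresponding input dimension. A small Python sketch of that inference (hypothetical helper):

    from functools import reduce
    from operator import mul

    def infer_shape(target, in_shape):
        # 0 copies the input dim at the same position; at most one -1 allowed.
        out = [in_shape[i] if d == 0 else d for i, d in enumerate(target)]
        if out.count(-1) > 1:
            raise ValueError("only one dimension can be set to -1")
        if -1 in out:
            known = reduce(mul, (d for d in out if d != -1), 1)
            out[out.index(-1)] = reduce(mul, in_shape, 1) // known
        return out

    print(infer_shape([2, 3, -1, 2], [2, 4, 6]))  # [2, 3, 4, 2]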
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
...
@@ -14,7 +14,7 @@ limitations under the License. */

 #include <unistd.h>
 #include <string>
-#include <thread>
+#include <thread>  // NOLINT

 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,11 +37,11 @@ namespace m = paddle::operators::math;
 std::unique_ptr<f::OperatorBase> listen_and_serv_op;
 int selected_port;

-void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   for (int i = 0; i < 2; ++i) {
     auto var_name = paddle::string::Sprintf("x%d", i);
-    auto var = scope.Var(var_name);
+    auto var = scope->Var(var_name);
     auto tensor = var->GetMutable<f::LoDTensor>();
     tensor->Resize({10, 10});
     float *expect = tensor->mutable_data<float>(place);
@@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
     }
   }

-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out_tensor = out_var->GetMutable<f::LoDTensor>();
   out_tensor->Resize({10, 10});
   out_tensor->mutable_data<float>(place);  // allocate
 }

-void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   int64_t height = 10;
   int64_t row_numel = 10;
   m::SetConstant<p::CPUDeviceContext, float> set_one;
   // init x0
   std::vector<int64_t> rows0{0, 4, 7};
-  auto x0_var = scope.Var("x0");
+  auto x0_var = scope->Var("x0");
   auto x0 = x0_var->GetMutable<f::SelectedRows>();
   x0->set_rows(rows0);
   x0->set_height(height);
@@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {

   // init x1
   std::vector<int64_t> rows1{2, 9};
-  auto x1_var = scope.Var("x1");
+  auto x1_var = scope->Var("x1");
   auto x1 = x1_var->GetMutable<f::SelectedRows>();
   x1->set_rows(rows1);
   x1->set_height(height);
@@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
       f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
   set_one(ctx, x1_value, 1.0);

-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out = out_var->GetMutable<f::SelectedRows>();
   auto out_value = out->mutable_value();
   out->set_height(height);
@@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) {
   f::Scope scope;
   p::CPUPlace place;
   if (is_sparse) {
-    InitSelectedRowsInScope(scope, place);
+    InitSelectedRowsInScope(place, &scope);
   } else {
-    InitTensorsInScope(scope, place);
+    InitTensorsInScope(place, &scope);
   }

   // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
   const auto &root_block = program.Block(0);
   auto *optimize_block = program.AppendBlock(root_block);
+  auto *prefetch_block = program.AppendBlock(root_block);
   // X for server side tensors, RX for received tensers, must be of same shape.
   AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
@@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) {
   attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
   attrs.insert({"GradList", std::vector<std::string>({"x1"})});
   attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"PrefetchBlock", prefetch_block});
   listen_and_serv_op =
       f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
   LOG(INFO) << "selected port before run " << selected_port;
@@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) {
   // local net
   f::Scope scope;
   p::CPUPlace place;
-  InitTensorsInScope(scope, place);
+  InitTensorsInScope(place, &scope);
   // create rpc client var
   scope.Var("RPC_CLIENT_VAR");
@@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) {
   f::Scope scope;
   p::CPUPlace place;
   p::CPUDeviceContext ctx(place);
-  InitSelectedRowsInScope(scope, place);
+  InitSelectedRowsInScope(place, &scope);
   scope.Var("RPC_CLIENT_VAR");
   f::AttributeMap attrs;
   selected_port = static_cast<paddle::operators::ListenAndServOp *>(
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <future>
+#include <future>  // NOLINT
 #include <ostream>

 #include "paddle/fluid/framework/data_type.h"
@@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase {
     auto ins = Inputs("X");

     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    int sync_send = Attr<int>("sync_sent");
+    int sync_send = Attr<int>("sync_send");

     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
...
@@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                       "Learning rate should have 1 element");
     auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at complie
-    // and run time.
+    // TODO(qijun): check dimensions of Param and Grad at compile
+    // and runtime.
     ctx->SetOutputDim("ParamOut", param_dim);
   }
...
@@ -65,7 +65,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
       auto &grad_rows = grad->rows();
       size_t grad_row_numel = grad_value.numel() / grad_rows.size();
-      PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height);
+      PADDLE_ENFORCE_EQ(static_cast<int64_t>(grad_row_numel),
+                        param_out->numel() / grad_height);

       auto *grad_data = grad_value.data<T>();
       auto *out_data = param_out->data<T>();
@@ -73,7 +74,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
       for (size_t i = 0; i < grad_rows.size(); i++) {
         PADDLE_ENFORCE(grad_rows[i] < grad_height,
                        "Input rows index should less than height");
-        for (int64_t j = 0; j < grad_row_numel; j++) {
+        for (size_t j = 0; j < grad_row_numel; j++) {
           out_data[grad_rows[i] * grad_row_numel + j] -=
               lr[0] * grad_data[i * grad_row_numel + j];
         }
@@ -107,7 +108,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
         PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
                        "Input rows index should less than height");
         int64_t id_index = param.index(grad.rows()[i]);
-        for (int64_t j = 0; j < grad_row_width; j++) {
+        for (size_t j = 0; j < grad_row_width; j++) {
           out_data[id_index * grad_row_width + j] -=
               lr[0] * grad_data[i * grad_row_width + j];
         }
...
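Both sparse branches apply the same rule: only the rows present in the gradient are touched, each decremented by lr * grad_row. A small NumPy sketch of that update (names hypothetical):

    import numpy as np

    def sparse_sgd_update(param, rows, grad_values, lr):
        # param: full [height, width] table; grad_values: one row per entry in rows.
        for i, row in enumerate(rows):
            param[row] -= lr * grad_values[i]
        return param

    param = np.ones((10, 4), dtype=np.float32)
    grad = np.full((2, 4), 0.5, dtype=np.float32)
    sparse_sgd_update(param, rows=[3, 7], grad_values=grad, lr=0.1)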
@@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");

     auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
-
     auto ids_dims = ctx->GetInputDim("Ids");
-    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
   }
 };

@@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
+    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
     for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_var)->SetType(input_var->GetType());
     }
   }
 };

@@ -73,4 +74,5 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
                   ops::SplitIdsOpInferVarType);
 REGISTER_OP_CPU_KERNEL(
-    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
+    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
+    ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
@@ -24,35 +24,63 @@ namespace operators {
 template <typename DeviceContext, typename T>
 class SplitIdsOpKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
     auto place = ctx.GetPlace();
     if (!platform::is_cpu_place(place)) {
       PADDLE_THROW("SplitIds do not support GPU kernel");
     }

-    auto& ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
-    const T* ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
-    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
-    const size_t shard_num = outs.size();
+    const auto *ids_var = ctx.InputVar("Ids");
+    if (ids_var->IsType<framework::LoDTensor>()) {
+      const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
+      const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+      auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+      const size_t shard_num = outs.size();

-    std::vector<std::vector<T>> out_ids;
-    out_ids.resize(outs.size());
+      std::vector<std::vector<T>> out_ids;
+      out_ids.resize(outs.size());

-    // split id by their shard_num.
-    for (int i = 0; i < ids_dims[0]; ++i) {
-      T id = ids[i];
-      size_t shard_id = static_cast<size_t>(id) % shard_num;
-      out_ids[shard_id].push_back(id);
-    }
+      // split id by their shard_num.
+      for (int i = 0; i < ids_dims[0]; ++i) {
+        T id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        out_ids[shard_id].push_back(id);
+      }

-    // create tensor for each shard and send to parameter server
-    for (size_t i = 0; i < out_ids.size(); ++i) {
-      auto* shard_t = outs[i];
-      std::vector<T> ids = out_ids[i];
-      auto* shard_data = shard_t->mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-      for (size_t i = 0; i < ids.size(); ++i) {
-        shard_data[i] = ids[i];
+      // create tensor for each shard and send to parameter server
+      for (size_t i = 0; i < out_ids.size(); ++i) {
+        auto *shard_t = outs[i];
+        std::vector<T> ids = out_ids[i];
+        auto *shard_data = shard_t->mutable_data<T>(
+            framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+        for (size_t i = 0; i < ids.size(); ++i) {
+          shard_data[i] = ids[i];
+        }
+      }
+    } else if (ids_var->IsType<framework::SelectedRows>()) {
+      const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
+      auto &ids_dims = ids_selected_rows->value().dims();
+      PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
+      const T *ids = ids_selected_rows->value().data<T>();
+      const auto &ids_rows = ids_selected_rows->rows();
+      auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+      const size_t shard_num = outs.size();
+      // get rows for outputs
+      for (auto &id : ids_rows) {
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(id);
+      }
+
+      int64_t row_width = ids_dims[1];
+      for (auto &out : outs) {
+        out->set_height(ids_selected_rows->height());
+        framework::DDim ddim = framework::make_ddim(
+            {static_cast<int64_t>(out->rows().size()), row_width});
+        T *output = out->mutable_value()->mutable_data<T>(ddim, place);
+        for (size_t i = 0; i < ddim[0]; ++i) {
+          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+                 row_width * sizeof(T));
+        }
       }
     }
   }
...
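In both branches the shard assignment is identical: an id goes to shard id % shard_num; the SelectedRows branch additionally copies each selected row's row_width-wide slice into its shard. The core rule in a few lines of Python (hypothetical helper):

    def split_ids(ids, shard_num):
        shards = [[] for _ in range(shard_num)]
        for i in ids:
            shards[i % shard_num].append(i)
        return shards

    print(split_ids([1, 5, 7, 2, 8], 3))  # [[], [1, 7], [5, 2, 8]]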
@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
                           const framework::DDim& src_stride,
                           const framework::DDim& dst_dim,
                           const framework::DDim& dst_stride, T* dst) {
-  using namespace detail;
-  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
+  paddle::operators::detail::StridedCopyDimVisitor<T> func(
+      dev_ctx, src, src_stride, dst_stride, dst);
   boost::apply_visitor(func, dst_dim);
 }
...
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/sum_op.h"
+#include <algorithm>
 #include <string>
 #include <vector>

+#include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputsDim("X");
     size_t N = x_dims.size();
-    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
+    if (N == 1) {
+      VLOG(3) << "Warning: sum have only one input, may waste memory";
+    }

     framework::DDim in_dim({0});
     for (auto& x_dim : x_dims) {
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"

 namespace paddle {
@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
 }

 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* src,
-                                              bool& firstStep, bool& is_empty,
-                                              Pair<T>& max, int dim,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
                                               const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, src, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
         } else {
           topk[k].set(-INFINITY, -1);
         }
       }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
                               length);
       }
     }

-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
   }
 }

 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* val,
-                                              int* col, bool& firstStep,
-                                              bool& is_empty, Pair<T>& max,
+                                              int* col, bool* firstStep,
+                                              bool* is_empty, Pair<T>* max,
                                               int dim, const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - *beam) {
+          topk[k] = topk[k + *beam];
         } else {
           topk[k].set(-INFINITY, -1);
         }
       }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
                               length);
       }
     }

-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
   }
 }

 template <typename T, int MaxLength, int BlockSize>
 __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
                                             Pair<T> topk[], T** topVal,
-                                            int64_t** topIds, int& beam, int& k,
+                                            int64_t** topIds, int* beam, int* k,
                                             const int tid, const int warp) {
   while (true) {
     __syncthreads();
@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
       (*topVal)++;
       (*topIds)++;
     }
-    if (tid == maxid[0]) beam++;
-    if (--k == 0) break;
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
     __syncthreads();

     if (tid == maxid[0]) {
-      if (beam < MaxLength) {
-        sh_topk[tid] = topk[beam];
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
       }
     }
     if (maxid[0] / 32 == warp) {
-      if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
+      if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break;
     }
   }
 }
@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
     topk[k].set(-INFINITY, -1);
   }
   while (k) {
-    ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
-                                           src + blockIdx.x * lds, firststep,
-                                           is_empty, max, dim, tid);
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
+                                           src + blockIdx.x * lds, &firststep,
+                                           &is_empty, &max, dim, tid);

     sh_topk[tid] = topk[0];
     BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                         &indices, beam, k, tid, warp);
+                                         &indices, &beam, &k, tid, warp);
   }
 }
@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     KeMatrixTopK<T, 5, 256><<<
         grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               ctx.device_context())
-                              .stream()>>>(output_data, output->dims()[1],
-                                           indices_data, input_data,
-                                           input_width, input_width, int(k));
+                              .stream()>>>(
+        output_data, output->dims()[1], indices_data, input_data, input_width,
+        input_width, static_cast<int>(k));
   }
 };
...
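Whatever the beam bookkeeping looks like inside the kernel, its contract is simple: for every row of the input matrix, return the k largest values and their column indices. A NumPy sketch of that contract, handy as a test reference (hypothetical helper; the real kernel works per thread block with beam-local top-k buffers):

    import numpy as np

    def rowwise_topk(x, k):
        idx = np.argsort(-x, axis=1)[:, :k]  # columns of the k largest values
        return np.take_along_axis(x, idx, axis=1), idx

    vals, idx = rowwise_topk(np.array([[1., 9., 3.], [7., 2., 5.]]), k=2)
    # vals -> [[9., 3.], [7., 5.]], idx -> [[1, 2], [0, 2]]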
@@ -1183,6 +1183,8 @@ class Parameter(Variable):

         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)

+        self.do_model_average = kwargs.get('do_model_average', None)
+
     def __str__(self):
         return self.to_string(True)
@@ -1203,7 +1205,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "do_model_average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          str(getattr(self, attr_name)))
...
@@ -218,6 +218,7 @@ def fc(input,
 def embedding(input,
               size,
               is_sparse=False,
+              is_distributed=False,
               padding_idx=None,
               param_attr=None,
               dtype='float32'):
@@ -268,8 +269,11 @@ def embedding(input,
         inputs={'Ids': input,
                 'W': w},
         outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse,
-               'padding_idx': padding_idx})
+        attrs={
+            'is_sparse': is_sparse,
+            'is_distributed': is_distributed,
+            'padding_idx': padding_idx
+        })
     return tmp
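With the new attribute plumbed through, a lookup table can be marked as distributed so that, in a transpiled program, its rows are served by the parameter server. A minimal usage sketch against the fluid API of this era (variable names are illustrative):

    import paddle.fluid as fluid

    ids = fluid.layers.data(name='ids', shape=[1], dtype='int64')
    emb = fluid.layers.embedding(
        input=ids,
        size=[100000, 64],
        is_sparse=True,
        is_distributed=True)  # rows looked up on the parameter server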
@@ -1516,7 +1520,8 @@ def batch_norm(input,
                in_place=False,
                name=None,
                moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               do_model_average_for_mean_and_var=False):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1547,7 +1552,10 @@ def batch_norm(input,
     mean = helper.create_parameter(
         attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     mean.stop_gradient = True
@@ -1556,7 +1564,8 @@ def batch_norm(input,
         attr=ParamAttr(
             name=moving_variance_name,
             initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     variance.stop_gradient = True
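A short usage sketch of the new flag: leaving it at its default False keeps the moving mean and variance out of model averaging, while setting it True includes them (illustrative code; the surrounding network is assumed):

    import paddle.fluid as fluid

    img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')
    conv = fluid.layers.conv2d(input=img, num_filters=16, filter_size=3)
    out = fluid.layers.batch_norm(
        input=conv,
        do_model_average_for_mean_and_var=True)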
@@ -3374,14 +3383,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     Here are some examples to explain it.

     1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
     is [6, 8], the reshape operator will transform x into a 2-D tensor with
     shape [6, 8] and leaving x's data unchanged.

     2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
     specified is [2, 3, -1, 2], the reshape operator will transform x into a
     4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
     case, one dimension of the target shape is set to -1, the value of this
     dimension is inferred from the total element number of x and remaining
     dimensions.

     3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
@@ -3615,7 +3624,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
 def pad(x, paddings, pad_value=0., name=None):
     """
     Pads a tensor with a constant value given by :attr:`pad_value`, and the
     padded width is specified by :attr:`paddings`.

     Specifically, the number of values padded before the contents of :attr:`x`
     in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
@@ -3643,7 +3652,7 @@ def pad(x, paddings, pad_value=0., name=None):
         x (Variable): The input tensor variable.
         paddings (list): A list of integers. Its elements specify the padded
                          width before and after for each dimension in turn.
                          The length of :attr:paddings must be
                          :math:`rank(x) \\times 2`.
         pad_value (float): The constant value used to pad.
         name(str|None): A name for this layer(optional). If set None, the layer
...
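The paddings convention in one concrete case: paddings=[1, 1, 2, 2] pads one row above and below and two columns left and right, so a [2, 3] input becomes [4, 7] (illustrative snippet):

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32')
    out = fluid.layers.pad(x=x, paddings=[1, 1, 2, 2], pad_value=0.)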
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import re
 from collections import defaultdict
 from paddle.fluid.framework import Program
 import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
     min_average_window, max_average_window and current update times.

     Args:
-        params_grads: A list of parameter-grad variable pairs.
         average_window_rate: The rate of average window.
+        params_grads: A list of parameter-grad variable pairs.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
@@ -840,8 +840,8 @@ class ModelAverage(Optimizer):
     """

     def __init__(self,
-                 params_grads,
                  average_window_rate,
+                 params_grads=None,
                  min_average_window=10000,
                  max_average_window=10000,
                  **kwargs):
@@ -849,24 +849,37 @@ class ModelAverage(Optimizer):
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
-        self.params_grads = params_grads
+
+        self.params_grads = [] if params_grads is None else params_grads
+        params = {}
+        for param, grad in self.params_grads:
+            if param.do_model_average != False:
+                params[param.name] = (param, grad)
+        for param in framework.default_main_program().global_block(
+        ).all_parameters():
+            if param.name not in params and param.do_model_average != False:
+                grad = param.block.create_var(
+                    name=unique_name.generate(".".join([param.name, 'tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                params[param.name] = (param, grad)
+        self.params_grads = params.values()
+
         for param, grad in self.params_grads:
-            if grad is not None:
-                self._append_average_accumulate_op(param)
+            self._append_average_accumulate_op(param)

         self.apply_program = Program()
         block = self.apply_program.global_block()
         with program_guard(main_program=self.apply_program):
             for param_grad in self.params_grads:
-                if param_grad[1] is not None:
-                    self._add_average_apply_op(block, param_grad)
+                self._add_average_apply_op(block, param_grad)

         self.restore_program = Program()
         block = self.restore_program.global_block()
         with program_guard(main_program=self.restore_program):
             for param_grad in self.params_grads:
-                if param_grad[1] is not None:
-                    self._add_average_restore_op(block, param_grad)
+                self._add_average_restore_op(block, param_grad)

     def _add_average_apply_op(self, block, param_grad):
         param = block.clone_variable(param_grad[0])
...
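With params_grads now optional, ModelAverage can collect parameters from the default main program on its own, honoring each parameter's do_model_average flag. A usage sketch under that reading (assumes a loss variable avg_cost and an executor exe defined earlier in the program):

    import paddle.fluid as fluid

    optimizer = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
    optimizer.minimize(avg_cost)

    model_average = fluid.optimizer.ModelAverage(
        average_window_rate=0.15,
        min_average_window=10000,
        max_average_window=20000)

    # Swap in the averaged weights for evaluation, then restore on exit.
    with model_average.apply(exe):
        pass  # run inference here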
@@ -28,13 +28,15 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 do_model_average=None):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
         self.gradient_clip = gradient_clip
+        self.model_average = do_model_average

     def set_default_initializer(self, initializer):
         if initializer is None:
@@ -80,7 +82,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'model_average': self.model_average
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -90,7 +93,7 @@ class ParamAttr(object):
 class WeightNormParamAttr(ParamAttr):
     """
     Used for weight normalization. Any field in ParamAttr can also be set here.
     Besides, an extra field dim can be set to indicate the dimension except
     which to normalize.
     """
     # List to record the parameters reparameterized by weight normalization.
...
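The new field is opt-in per parameter: setting do_model_average=False on a ParamAttr keeps that parameter out of averaging. A short illustrative snippet (input variable x assumed):

    import paddle.fluid as fluid

    w_attr = fluid.ParamAttr(name='fc_w', do_model_average=False)
    y = fluid.layers.fc(input=x, size=10, param_attr=w_attr)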
@@ -37,7 +37,7 @@ depth = 8
 mix_hidden_lr = 1e-3
 IS_SPARSE = True
-PASS_NUM = 10
+PASS_NUM = 100
 BATCH_SIZE = 10
 embedding_name = 'emb'
@@ -77,7 +77,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     emb_layers.append(mark_embedding)

     hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+        for emb in emb_layers
     ]

     hidden_0 = fluid.layers.sums(input=hidden_0_layers)
@@ -94,8 +95,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     for i in range(1, depth):
         mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
         ])

         lstm = fluid.layers.dynamic_lstm(
@@ -109,8 +110,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
         input_tmp = [mix_hidden, lstm]

     feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
     ])

     return feature_out
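
Each fc call above now applies tanh inside the projection before fluid.layers.sums adds the branches. The same computation in plain NumPy, as a sketch of what changed (shapes are arbitrary, not taken from the test):

    import numpy as np

    def fc_tanh(x, w, b):
        # fluid.layers.fc(..., act='tanh'): affine projection, then tanh
        return np.tanh(x.dot(w) + b)

    rng = np.random.RandomState(0)
    x0, x1 = rng.randn(4, 8), rng.randn(4, 8)   # two input branches
    w0, w1 = rng.randn(8, 5), rng.randn(8, 5)   # per-branch weights
    b = np.zeros(5)

    # fluid.layers.sums over the two activated projections
    feature_out = fc_tanh(x0, w0, b) + fc_tanh(x1, w1, b)
    print(feature_out.shape)  # (4, 5)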
@@ -171,7 +172,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
     # check other optimizers and check why out will be NAN
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.0001,
+            learning_rate=0.01,
             decay_steps=100000,
             decay_rate=0.5,
             staircase=True))
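
For reference, the staircase schedule configured above halves the base rate every 100000 steps, i.e. lr(step) = 0.01 * 0.5 ** floor(step / 100000). A quick check in plain Python (no Paddle required):

    import math

    def decayed_lr(step, base_lr=0.01, decay_steps=100000, decay_rate=0.5):
        # staircase=True: the exponent only grows at whole-interval boundaries
        return base_lr * decay_rate ** math.floor(step / float(decay_steps))

    print(decayed_lr(0))       # 0.01
    print(decayed_lr(99999))   # 0.01   -- still in the first interval
    print(decayed_lr(250000))  # 0.0025 -- halved twice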
@@ -233,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
                 print("second per batch: " + str((time.time(
                 ) - start_time) / batch_id))
                 # Set the threshold low to speed up the CI test
-                if float(pass_precision) > 0.05:
+                if float(pass_precision) > 0.01:
                     if save_dirname is not None:
                         # TODO(liuyiqun): Change the target to crf_decode
                         fluid.io.save_inference_model(save_dirname, [
......
@@ -157,7 +157,6 @@ def train(nn_type,
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        pserver_endpoints = os.getenv("PSERVERS")
         trainers = int(os.getenv("TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
         trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
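
The removed line used to clobber the endpoint string that had just been assembled from the pserver IP list, so the PSERVERS variable no longer takes effect here. A self-contained rerun of the surviving logic (the port/IP variable names and values below are illustrative assumptions, not the test's actual environment):

    import os

    # hypothetical environment, for demonstration only
    os.environ["DEMO_PSERVER_PORT"] = "6174"
    os.environ["DEMO_PSERVER_IPS"] = "127.0.0.1,127.0.0.2"

    port = os.getenv("DEMO_PSERVER_PORT")
    pserver_ips = os.getenv("DEMO_PSERVER_IPS")
    eplist = [':'.join([ip, port]) for ip in pserver_ips.split(",")]
    pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
    print(pserver_endpoints)  # 127.0.0.1:6174,127.0.0.2:6174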
......
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-# The fully connected test is removed whe the WITH_MKLDNN flag is OFF
-# Because the fully connected layer has only one kernel (MKLDNN)
-if(NOT WITH_MKLDNN)
-    list(REMOVE_ITEM TEST_OPS test_fc_op)
-endif(NOT WITH_MKLDNN)
 if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_recv_op)
 endif(NOT WITH_DISTRIBUTE)
......