提交 c1ab215e 编写于 作者: Y Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into dist_pass_barrier

......@@ -61,8 +61,10 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -131,6 +133,10 @@ if (NOT DEFINED WITH_MKLDNN)
set(WITH_MKLDNN OFF)
endif()
endif()
if (REPLACE_ENFORCE_GLOG)
add_definitions("-DREPLACE_ENFORCE_GLOG")
endif()
########################################################################################
include(external/mklml) # download mklml package
......@@ -153,12 +159,24 @@ include(external/cares)
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
include(external/grpc)
message(STATUS "Use grpc framework.")
else()
message(STATUS "Use brpc framework.")
include(external/leveldb)
include(external/brpc)
endif()
endif()
if(WITH_BRPC_RDMA)
message(STATUS "Use brpc with rdma.")
if(WITH_GRPC)
message(FATAL_ERROR "Can't use grpc with brpc rdma.")
endif()
if(NOT WITH_DISTRIBUTE)
message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
endif()
endif()
include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
......
......@@ -4,7 +4,6 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......
......@@ -174,3 +174,7 @@ endif(WITH_GOLANG)
if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC)
if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
endif(WITH_BRPC_RDMA)
......@@ -14,6 +14,15 @@
INCLUDE(ExternalProject)
find_library(SSL_LIBRARY NAMES ssl)
ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
find_library(CRYPTO_LIBRARY NAMES crypto)
ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
......@@ -22,14 +31,14 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf")
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")
# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add(
extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/brpc/brpc"
GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2"
GIT_REPOSITORY "https://github.com/gongweibao/brpc"
GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a"
PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......@@ -42,6 +51,8 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_PREFIX_PATH=${prefix_path}
-DBRPC_WITH_GLOG=ON
-DIOBUF_WITH_HUGE_BLOCK=ON
-DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
${EXTERNAL_OPTIONAL_ARGS}
LIST_SEPARATOR |
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
......@@ -49,7 +60,7 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy)
ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc)
......
......@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
# for building inference libs
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
function(merge_static_libs TARGET_NAME)
set(libs ${ARGN})
list(REMOVE_DUPLICATES libs)
......@@ -250,6 +264,7 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif()
endif()
endfunction(cc_test)
......@@ -314,6 +329,7 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif()
endif()
endfunction(nv_test)
......@@ -561,7 +577,7 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......
......@@ -12,19 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
# make package for paddle fluid shared and static library
function(copy TARGET)
set(options "")
......@@ -154,7 +141,7 @@ set(inference_deps paddle_fluid_shared paddle_fluid)
if(WITH_CONTRIB)
message(STATUS "installing contrib")
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
if (WITH_ANAKIN)
if (WITH_ANAKIN AND WITH_GPU)
copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
......@@ -163,9 +150,9 @@ if(WITH_CONTRIB)
list(APPEND inference_deps contrib_anakin_inference_lib)
endif()
copy(contrib_inference_lib DEPS paddle_inference_api
copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
DSTS ${contrib_dst_dir} ${contrib_dst_dir})
list(APPEND inference_deps contrib_inference_lib)
endif()
......
......@@ -46,6 +46,10 @@ cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_shared SHARED
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
......
......@@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: ", in.type().name());
memory::data_type out_type = in_type;
auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
void* in_data = GetDataFromTensor(in, in_type);
......
......@@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
return MKLDNNDataType::data_undef;
}
inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
MKLDNNFormat default_format) {
return (dims_size == 1
? mkldnn::memory::format::x
: dims_size == 2 ? mkldnn::memory::format::nc : default_format);
}
#endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
......
......@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace framework {
......@@ -48,8 +52,8 @@ void TransformData(const OpKernelType &expected_kernel_type,
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
auto out_format =
MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN);
......
......@@ -20,9 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/grpc_client.h"
#endif
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......
......@@ -68,7 +68,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
// only print first ten elements
int64_t size = t.numel() < 10 ? t.numel() : 10;
for (int64_t i = 0; i < size; ++i) {
if (t.type().hash_code() == typeid(float).hash_code()) {
if (t.type().hash_code() == typeid(float).hash_code()) { // NOLINT
os << t.data<float>()[i] << " ";
} else if (t.type().hash_code() == typeid(int64_t).hash_code()) {
os << t.data<int64_t>()[i] << " ";
......
......@@ -748,10 +748,6 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &var->Get<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value());
} else if (var->IsType<LoDTensorArray>()) {
const LoDTensorArray& arr = var->Get<LoDTensorArray>();
PADDLE_ENFORCE(arr.size() > 0);
t = &(arr[0]);
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
......
......@@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
t->set_lod(lod_tensors[j].lod());
}
}
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
}
ParallelExecutor::~ParallelExecutor() {
......
......@@ -23,9 +23,9 @@ namespace framework {
template <typename T>
inline const T* Tensor::data() const {
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
holder_->type() == std::type_index(typeid(T)),
"Tensor holds the wrong type, it holds %s",
bool valid = std::is_same<T, void>::value ||
holder_->type() == std::type_index(typeid(T));
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<const T*>(
......@@ -37,9 +37,9 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
template <typename T>
inline T* Tensor::data() {
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
holder_->type() == std::type_index(typeid(T)),
"Tensor holds the wrong type, it holds %s",
bool valid = std::is_same<T, void>::value ||
holder_->type() == std::type_index(typeid(T));
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
......
......@@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
if (platform::is_same_place(src_place, dst_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
if (platform::is_same_place(ctx_place, src_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (platform::is_same_place(ctx_place, dst_place)) {
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
}
}
}
#endif
}
......@@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(src.place())) {
dev_ctx = pool.Get(src.place());
} else {
if (platform::is_gpu_place(dst_place)) {
dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(src.place());
}
TensorCopy(src, dst_place, *dev_ctx, dst);
}
......
......@@ -23,10 +23,25 @@ limitations under the License. */
namespace paddle {
namespace framework {
// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
// and dst_place are two different GPU, to ensure that the operation can
// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
// If ctx_place and src_place are the same, src_ctx.Wait() is added
// after memory::Copy; if ctx_place and dst_place are the same,
// src_ctx.Wait() is added before memory::Copy.
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst);
// NOTE(zcd): If the src.place() and dst_place are two different GPU,
// the copy operation is carried out on the dst_place's stream. This is
// very important, because TensorCopy is an async operator, and in most
// case, once this copy operator returns, dst is to be used in dst_place's
// stream, if this copy operation is carried out on the src_place's stream,
// when dst is used in dst_place's stream the copy operation may be
// not completed.
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
......
# Inference Analysis
The `inference/analysis` module is used to analyze and optimize the inference program,
it references some philosophy from `LLVM/analysis`,
and make the various optimization features be pluggable and co-exist in a pipeline.
We borrowed some concepts from LLVM, such as
- [Pass](./pass.h)es to implement optimization that traverse the inference program,
- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
There are some other basic concepts here
- [Node](./node.h), the node in a `DataFlowGraph`,
- `Function`, the Operator in Fluid,
- `Value`, the Variable in Fluid;
- [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline,
## How it works
The `inference/analysis` module make all the passes in a pipeline, and works in such way:
1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes,
3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
The new optimization features can be added as an independent `Pass` and controlled by gflags,
each pass will generate unified debug information or visualization for better debugging.
## Supported Passes
### `FluidToDataFlowGraphPass`
Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes,
this should be the first pass of the pipeline.
### `DataFlowGraphToFluidPass`
Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline.
### `TensorRTSubgraphNodeMarkPass`
Mark the `Node` that are supported by TensorRT,
this pass will generate a visualization file which can be used for debugging.
### `TensorRTSubGraphPass`
Split the sub-graph that are can be accelerated by TensorRT.
### `DFG_GraphvizDrawPass`
This pass is just for debug, it will visualize the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
It can be used as a helper class that draws the modified graph after each pass.
## Utilities
There is some helper function/class for analysis.
- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms, it uses `iterator` to make the algorithms easy to share across different passes.
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
......
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
/*
* This file contains Analyzer, an class that exposed as a library that analyze
* and optimize
......
......@@ -138,7 +138,7 @@ struct GraphTraits<DataFlowGraph> {
// sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph.
static std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs;
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/proto_desc.h"
......@@ -150,13 +151,14 @@ namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {}
explicit DFG_DebuggerPass(const Config& config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
bool Finalize() override { return true; }
};
}
} // namespace
Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
......
......@@ -19,6 +19,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
......
......@@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
const bool display_deleted_node;
};
DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
bool Initialize(Argument *argument) override { return true; }
void Run(DataFlowGraph *graph) override;
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "analyzer.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
......@@ -88,7 +88,8 @@ namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {}
explicit DFG_DebuggerPass(const Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
bool Finalize() override { return true; }
};
......
......@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include <gtest/gtest.h>
namespace paddle {
namespace inference {
namespace analysis {
......
......@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include <string>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/node_attr_flags.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
namespace paddle {
namespace inference {
......@@ -29,7 +31,7 @@ void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
class DfgDebuggerPass : public DFG_GraphvizDrawPass {
public:
DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override {
......
......@@ -16,6 +16,10 @@
* This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops
* that supported by TensorRT engine.
*/
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
......@@ -30,7 +34,8 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
public:
using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller;
TensorRTSubgraphNodeMarkPass(const teller_t& teller) : teller_(teller) {}
explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller)
: teller_(teller) {}
bool Initialize(Argument* argument) override { return true; }
......@@ -38,8 +43,10 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
// sub-graph into TensorRT.
void Run(DataFlowGraph* graph) override;
std::string repr() const { return "tensorrt-sub-subgraph-mark"; }
std::string description() const { return "tensorrt sub-graph mark pass"; }
std::string repr() const override { return "tensorrt-sub-subgraph-mark"; }
std::string description() const override {
return "tensorrt sub-graph mark pass";
}
Pass* CreateGraphvizDebugerPass() const override;
bool Finalize() override;
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
......@@ -30,7 +31,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
// Tell whether to transform a sub-graph into TensorRT.
using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller;
TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
bool Initialize(Argument* argument) override { return true; }
......@@ -40,8 +41,8 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
bool Finalize() override { return true; }
std::string repr() const { return "tensorrt-sub-graph"; }
std::string description() const { return "tensorrt sub graph pass"; }
std::string repr() const override { return "tensorrt-sub-graph"; }
std::string description() const override { return "tensorrt sub graph pass"; }
private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_;
......@@ -49,4 +50,4 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
} // namespace analysis
} // namespace inference
} // paddle
} // namespace paddle
......@@ -20,6 +20,12 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle {
......@@ -41,6 +47,9 @@ template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
return p;
}
......@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev);
}
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
return ptr;
}
......@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
}
......
......@@ -184,6 +184,7 @@ else()
set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif()
set(DISTRIBUTE_DEPS "")
if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
......@@ -192,6 +193,18 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
else()
set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib)
if(WITH_BRPC_RDMA)
find_library(IBVERBS_LIBRARY NAMES ibverbs)
ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
find_library(RDMACM_LIBRARY NAMES rdmacm)
ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm)
endif()
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
......@@ -205,7 +218,7 @@ if(WITH_DISTRIBUTE)
# listen_and_serv_op sum_op executor SERIAL)
if(WITH_GPU)
set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL)
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL)
if(WITH_GRPC)
op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
else()
......@@ -297,6 +310,7 @@ foreach(src ${DETECTION_LIBRARY})
endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
......
......@@ -115,8 +115,11 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
auto src_memory =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
auto src_memory = memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// create primitive descriptor for batch norm forward
......@@ -251,14 +254,20 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
// create mkldnn memory from input diff_y tensor
auto user_diff_dst_memory =
memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
mkldnn_engine},
mkldnn::memory::format dst_format =
platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
auto user_diff_dst_memory = memory(
{{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
to_void_cast(diff_y_data));
// create mkldnn memory from input x tensor
auto src_memory =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
auto src_memory = memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// for diff_dst, try to use same format as dst in forward pass
......
......@@ -14,14 +14,22 @@
#pragma once
#ifdef PADDLE_WITH_DISTRIBUTE
#ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_server.h"
#define RPCSERVER_T distributed::AsyncGRPCServer
#define RPCCLIENT_T distributed::GRPCClient
#else
#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
#define RPCCLIENT_T paddle::operators::distributed::GRPCClient
#else // PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/operators/distributed/brpc_server.h"
#define RPCSERVER_T distributed::AsyncBRPCServer
#define RPCCLIENT_T distributed::BRPCClient
#endif
#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
#define RPCCLIENT_T paddle::operators::distributed::BRPCClient
#endif // PADDLE_WITH_GRPC
#endif // PADDLE_WITH_DISTRIBUTE
......@@ -16,7 +16,7 @@
#include "gflags/gflags.h"
// default to 3min to avoid temprary network failures.
DEFINE_int32(rpc_deadline, 30000, "deadline timeouts for rpc");
DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc");
namespace paddle {
namespace operators {
......
......@@ -26,12 +26,8 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
"Input(X) of FillZerosLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillZerosLikeOp should not be null.");
if (ctx->IsRuntime() &&
ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR_ARRAY) {
return; // skip runtime infershape when is tensor array;
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
};
......@@ -43,7 +39,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
FillZerosLike Operator.
Fill up a variable with zeros, supporting both LoDTensor and LoDTensorArray.
Fill up a variable with zeros.
The output will have the same size as the input.
)DOC");
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -24,30 +23,13 @@ template <typename DeviceContext, typename T>
class FillZerosLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto var = context.InputVar("X");
if (var->IsType<framework::LoDTensor>()) {
auto& input = *context.Input<framework::LoDTensor>("X");
auto& output = *context.Output<framework::LoDTensor>("Out");
output.Resize(input.dims());
output.set_lod(input.lod());
output.mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), &(output),
static_cast<T>(0));
} else if (var->IsType<framework::LoDTensorArray>()) {
auto& input = *context.Input<framework::LoDTensorArray>("X");
auto& output = *context.Output<framework::LoDTensorArray>("Out");
output.resize(input.size());
for (auto i = 0; i < input.size(); i++) {
output[i].Resize(input[i].dims());
output[i].set_lod(input[i].lod());
output[i].mutable_data<T>(context.GetPlace());
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), &(output[i]),
setter(context.template device_context<DeviceContext>(), out,
static_cast<T>(0));
}
}
}
};
} // namespace operators
......
......@@ -113,7 +113,11 @@ template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
bool stat, const Args&... args) {
if (UNLIKELY(!(stat))) {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(string::Sprintf(args...));
#else
LOG(FATAL) << string::Sprintf(args...);
#endif
}
}
......@@ -123,8 +127,12 @@ template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudaError_t e, const Args&... args) {
if (UNLIKELY(e)) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(e, thrust::cuda_category(),
string::Sprintf(args...));
#else
LOG(FATAL) << string::Sprintf(args...);
#endif
}
}
......@@ -132,8 +140,12 @@ template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
curandStatus_t stat, const Args&... args) {
if (stat != CURAND_STATUS_SUCCESS) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
string::Sprintf(args...));
#else
LOG(FATAL) << string::Sprintf(args...);
#endif
}
}
......@@ -143,8 +155,12 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
if (stat == CUDNN_STATUS_SUCCESS) {
return;
} else {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) +
string::Sprintf(args...));
#else
LOG(FATAL) << string::Sprintf(args...);
#endif
}
}
......@@ -173,7 +189,11 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
} else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
err = "CUBLAS: license error, ";
}
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(err + string::Sprintf(args...));
#else
LOG(FATAL) << err << string::Sprintf(args...);
#endif
}
#ifndef __APPLE__
......@@ -183,8 +203,13 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
if (stat == ncclSuccess) {
return;
} else {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
string::Sprintf(args...));
#else
LOG(FATAL) << platform::dynload::ncclGetErrorString(stat)
<< string::Sprintf(args...);
#endif
}
}
#endif // __APPLE__
......@@ -203,6 +228,7 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \
} while (false)
#ifndef REPLACE_ENFORCE_GLOG
#define PADDLE_ENFORCE(...) \
do { \
try { \
......@@ -212,6 +238,9 @@ inline void throw_on_error(T e) {
__FILE__, __LINE__); \
} \
} while (false)
#else
#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
#endif
/*
* Some enforce helpers here, usage:
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <mkldnn.h>
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/place.h"
......@@ -182,10 +183,11 @@ class MKLDNNHandler {
}
std::shared_ptr<mkldnn::memory> AcquireMemory(
mkldnn::memory::primitive_desc& mpd,
mkldnn::memory::primitive_desc& user_mpd,
mkldnn::memory::primitive_desc& mpd, // NOLINT
mkldnn::memory::primitive_desc& user_mpd, // NOLINT
const std::shared_ptr<mkldnn::memory> user_memory_p,
const std::string& suffix, std::vector<mkldnn::primitive>& pipeline) {
const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
// create reorder primitive if the input format is not the preferred one
auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p";
......@@ -218,7 +220,7 @@ class MKLDNNHandler {
return target_memory_p;
}
static std::string GetHash(mkldnn::memory::dims& operand_dims,
static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT
const std::string& suffix) {
auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
......@@ -227,8 +229,9 @@ class MKLDNNHandler {
}
return dstr;
};
return dims2str(operand_dims) + suffix;
};
}
protected:
const MKLDNNDeviceContext& dev_ctx_;
......@@ -237,5 +240,15 @@ class MKLDNNHandler {
bool is_reusing_;
};
inline mkldnn::memory::format MKLDNNFormatForSize(
size_t dims_size, mkldnn::memory::format data_format) {
if (dims_size == 1) {
return mkldnn::memory::format::x;
} else if (dims_size == 2) {
return mkldnn::memory::format::nc;
}
return data_format;
}
} // namespace platform
} // namespace paddle
......@@ -84,7 +84,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
}
template <typename... Args>
std::string Sprintf(const char* fmt, const Args&... args) {
std::string Sprintf(const char* fmt = "", const Args&... args) {
std::ostringstream oss;
Fprintf(oss, fmt, args...);
return oss.str();
......
......@@ -118,7 +118,8 @@ def __bootstrap__():
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem'
]
if core.is_compiled_with_cuda():
read_env_flags += [
......
......@@ -95,7 +95,6 @@ __all__ = [
'relu',
'log',
'crop',
'fill_zeros_like',
]
......@@ -5185,40 +5184,3 @@ def crop(x, shape=None, offsets=None, name=None):
outputs={'Out': out},
attrs=None if len(attrs) == 0 else attrs)
return out
def fill_zeros_like(x):
"""
This layer takes an input and outputs a variable that has the same structure as
the input and with all the element values as zero. The variable can be a Tensor
or TensorArray.
.. code-block:: text
Given
X = [[0, 1, 2, 0],
[0, 3, 4, 0],
[0, 0, 0, 0]],
output is:
Out = [[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]].
Args:
x (Variable): The input variable, which could be a tensor or tensor array
Returns:
Variable: The zero-filled variable, which has the same type and shape as
the input variable.
Examples:
.. code-block:: python
y = fluid.layers.fill_zeros_like(x)
"""
helper = LayerHelper('fill_zeros_like', **locals())
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]})
return out
......@@ -110,14 +110,23 @@ def infer(use_cuda, save_dirname=None):
# The input's dimension should be 2-D and the second dim is 13
# The input data should be >= 0
batch_size = 10
tensor_x = numpy.random.uniform(0, 10,
[batch_size, 13]).astype("float32")
test_reader = paddle.batch(
paddle.dataset.uci_housing.test(), batch_size=batch_size)
test_data = test_reader().next()
test_feat = numpy.array(
[data[0] for data in test_data]).astype("float32")
test_label = numpy.array(
[data[1] for data in test_data]).astype("float32")
assert feed_target_names[0] == 'x'
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_x},
feed={feed_target_names[0]: numpy.array(test_feat)},
fetch_list=fetch_targets)
print("infer shape: ", results[0].shape)
print("infer results: ", results[0])
print("ground truth: ", test_label)
def main(use_cuda, is_local=True):
......
......@@ -52,3 +52,4 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
......@@ -18,6 +18,8 @@ import unittest
import paddle.fluid as fluid
import time
import numpy as np
import math
import sys
__all__ = ['TestParallelExecutorBase']
......@@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase):
print "%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin))
avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean()
if math.isnan(float(avg_last_loss_val)) or math.isnan(
float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.")
print first_loss, last_loss
# self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
IS_SPARSE = True
EMBED_SIZE = 32
HIDDEN_SIZE = 256
N = 5
BATCH_SIZE = 32
ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
def get_model():
def __network__(words):
embed_first = fluid.layers.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='shared_w')
embed_forth = fluid.layers.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='shared_w')
concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
hidden1 = fluid.layers.fc(input=concat_embed,
size=HIDDEN_SIZE,
act='sigmoid')
predict_word = fluid.layers.fc(input=hidden1,
size=dict_size,
act='softmax')
cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
avg_cost = fluid.layers.mean(cost)
return avg_cost, predict_word
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
avg_cost, predict_word = __network__(
[first_word, second_word, third_word, forth_word, next_word])
inference_program = paddle.fluid.default_main_program().clone()
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
return inference_program, avg_cost, train_reader, test_reader, predict_word
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=trainer_id,
program=main_program,
pservers=pserver_endpoints,
trainers=trainers)
return t
def run_pserver(pserver_endpoints, trainers, current_endpoint):
get_model()
t = get_transpiler(0,
fluid.default_main_program(), pserver_endpoints,
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
exe.run(pserver_prog)
class TestDistMnist(unittest.TestCase):
def setUp(self):
self._trainers = 1
self._pservers = 1
self._ps_endpoints = "127.0.0.1:9123"
def start_pserver(self, endpoint):
p = Process(
target=run_pserver,
args=(self._ps_endpoints, self._trainers, endpoint))
p.start()
return p.pid
def _wait_ps_ready(self, pid):
retry_times = 5
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(1)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
def stop_pserver(self, pid):
os.kill(pid, signal.SIGKILL)
def test_with_place(self):
p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
pserver_pid = self.start_pserver(self._ps_endpoints)
self._wait_ps_ready(pserver_pid)
self.run_trainer(p, 0)
self.stop_pserver(pserver_pid)
def run_trainer(self, place, trainer_id):
test_program, avg_cost, train_reader, test_reader, predict = get_model()
t = get_transpiler(trainer_id,
fluid.default_main_program(), self._ps_endpoints,
self._trainers)
trainer_prog = t.get_trainer_program()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
use_gpu = True if core.is_compiled_with_cuda() else False
exec_strategy = ExecutionStrategy()
exec_strategy.use_cuda = use_gpu
train_exe = fluid.ParallelExecutor(
use_cuda=use_gpu,
main_program=trainer_prog,
loss_name=avg_cost.name,
exec_strategy=exec_strategy)
feed_var_list = [
var for var in trainer_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in xrange(10):
for batch_id, data in enumerate(train_reader()):
avg_loss_np = train_exe.run(feed=feeder.feed(data),
fetch_list=[avg_cost.name])
loss = np.array(avg_loss_np).mean()
if float(loss) < 5.0:
return
if math.isnan(loss):
assert ("Got Nan loss, training failed")
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid.core as core
import numpy
import paddle.fluid.layers as layers
from paddle.fluid.framework import Program, program_guard
from paddle.fluid.executor import Executor
import paddle.fluid as fluid
import paddle.fluid.core as core
class TestFillZerosLikeOpForTensorArray(unittest.TestCase):
def place(self):
return core.CPUPlace()
def test_zero_filling_lod_tensor_array(self):
tensor = core.LoDTensor()
tensor.set(
numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
expect = [
numpy.array(
[0, 0, 0, 0, 0], dtype='int32'), numpy.array(
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int32'),
numpy.array(
[0, 0, 0], dtype='int32')
]
lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
self.main(
tensor=tensor,
expect_array=expect,
expect_lod=lod,
expect_max_len=3)
def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
place = self.place()
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[10])
x.persistable = True
table = layers.lod_rank_table(x, level=level)
max_len = layers.max_sequence_len(table)
max_len.persistable = True
array = layers.lod_tensor_to_array(x, table)
array = layers.fill_zeros_like(array)
array.persistable = True
result = layers.array_to_lod_tensor(array, table)
result.persistable = True
exe = Executor(place)
scope = core.Scope()
exe.run(program, feed={'x': tensor}, scope=scope)
var = scope.find_var(array.name)
array = var.get_lod_tensor_array()
if expect_array is not None and expect_lod is not None:
self.check_array_same(array, expect_array, expect_lod)
self.assertEqual(
numpy.array(scope.find_var(max_len.name).get_tensor())[0],
expect_max_len)
def check_array_same(self, array, expect_tensor, expect_lod):
self.assertEqual(len(expect_tensor), len(array))
for i, exp in enumerate(zip(expect_tensor, expect_lod)):
exp_tensor, exp_lod = exp
exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
self.assertEqual(exp_lod, array[i].lod())
if __name__ == '__main__':
unittest.main()
......@@ -16,6 +16,8 @@ import paddle.fluid as fluid
import numpy as np
import unittest
import os
import sys
import math
def simple_fc_net():
......@@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
train_loss, = train_exe.run([loss.name], feed=feed_dict)
avg_test_loss_val = np.array(test_loss).mean()
if math.isnan(float(avg_test_loss_val)):
sys.exit("got NaN loss, testing failed.")
avg_train_loss_val = np.array(train_loss).mean()
if math.isnan(float(avg_train_loss_val)):
sys.exit("got NaN loss, training failed.")
self.assertTrue(
np.allclose(
train_loss, test_loss, atol=1e-8),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册