Commit 49a04d75 authored by fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_data_balance

......@@ -4,7 +4,6 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......
......@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find_fluid_modules collects all fluid modules; the list is used to build
# the paddle fluid static library and the inference libs
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
function(merge_static_libs TARGET_NAME)
set(libs ${ARGN})
list(REMOVE_DUPLICATES libs)
......@@ -250,6 +264,7 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif()
endif()
endfunction(cc_test)
......@@ -314,6 +329,7 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif()
endif()
endfunction(nv_test)
......@@ -561,7 +577,7 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......
......@@ -12,19 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
# make package for paddle fluid shared and static library
function(copy TARGET)
set(options "")
......@@ -163,9 +150,9 @@ if(WITH_CONTRIB)
list(APPEND inference_deps contrib_anakin_inference_lib)
endif()
copy(contrib_inference_lib DEPS paddle_inference_api
copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
DSTS ${contrib_dst_dir} ${contrib_dst_dir})
list(APPEND inference_deps contrib_inference_lib)
endif()
......
......@@ -74,10 +74,10 @@ void OperatorWithKernel::Run(
auto kernel_type_for_var = this->GetKernelTypeForVar(...);
if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
auto* trans_var = new_scope.Var(var_name);
auto* out = DataTransform(expected_kernel_key,
auto* out = TransformData(expected_kernel_key,
kernel_type_for_var,
*tensor_in);
CopyVariableWithTensor(...);
SetTensorToVariable(...);
}
}
......
......@@ -46,6 +46,10 @@ cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_shared SHARED
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api)
......
......@@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: ", in.type().name());
memory::data_type out_type = in_type;
auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
void* in_data = GetDataFromTensor(in, in_type);
......
......@@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
return MKLDNNDataType::data_undef;
}
inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
MKLDNNFormat default_format) {
return (dims_size == 1
? mkldnn::memory::format::x
: dims_size == 2 ? mkldnn::memory::format::nc : default_format);
}
#endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
......
......@@ -18,17 +18,21 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace framework {
static void PassTensorData(Tensor* from, Tensor* to) {
static void PassTensorData(Tensor *from, Tensor *to) {
to->ShareDataWith(*from);
*from = Tensor();
}
void DataTransform(const OpKernelType& expected_kernel_type,
const OpKernelType& kernel_type_for_var,
const Tensor& input_tensor, Tensor* output_tensor) {
void TransformData(const OpKernelType &expected_kernel_type,
const OpKernelType &kernel_type_for_var,
const Tensor &input_tensor, Tensor *output_tensor) {
bool transformed = false;
Tensor in;
in.ShareDataWith(input_tensor);
......@@ -48,8 +52,8 @@ void DataTransform(const OpKernelType& expected_kernel_type,
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
auto out_format =
MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN);
......@@ -89,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type,
output_tensor->ShareDataWith(in);
}
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
Variable* out_var) {
void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
Variable *out_var) {
if (in_var.IsType<LoDTensor>()) {
auto& in_lod_tensor = in_var.Get<LoDTensor>();
auto* tran_lod_tensor = out_var->GetMutable<LoDTensor>();
auto &in_lod_tensor = in_var.Get<LoDTensor>();
auto *tran_lod_tensor = out_var->GetMutable<LoDTensor>();
tran_lod_tensor->set_lod(in_lod_tensor.lod());
tran_lod_tensor->set_layout(in_lod_tensor.layout());
tran_lod_tensor->ShareDataWith(tensor);
} else if (in_var.IsType<SelectedRows>()) {
auto& in_selected_rows = in_var.Get<SelectedRows>();
auto* trans_selected_rows = out_var->GetMutable<SelectedRows>();
auto &in_selected_rows = in_var.Get<SelectedRows>();
auto *trans_selected_rows = out_var->GetMutable<SelectedRows>();
trans_selected_rows->set_height(in_selected_rows.height());
trans_selected_rows->set_rows(in_selected_rows.rows());
trans_selected_rows->mutable_value()->ShareDataWith(tensor);
......
......@@ -30,12 +30,15 @@ limitations under the License. */
namespace paddle {
namespace framework {
void DataTransform(const OpKernelType& expected_kernel_type,
const OpKernelType& kernel_type_for_var,
const Tensor& input_tensor, Tensor* out);
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
Variable* out_var);
void TransformData(const OpKernelType &expected_kernel_type,
const OpKernelType &kernel_type_for_var,
const Tensor &input_tensor, Tensor *out);
/**
* Set OutVar from InVar, but share OutVar's tensor with `tensor`
*/
void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
Variable *out_var);
} // namespace framework
} // namespace paddle
......@@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
return ret;
}
inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) {
return (!platform::places_are_same_class(l.place_, r.place_)) ||
(l.data_type_ != r.data_type_) ||
NeedTransformLayout(l.data_layout_, r.data_layout_);
......
......@@ -620,8 +620,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
"There are no kernels which are registered in the %s operator.", type_);
}
ExecutionContext ctx(*this, scope, *dev_ctx);
OpKernelMap& kernels = kernels_iter->second;
// TODO(dzhwinter) : kernel fallback mechanism will be added when all the
......@@ -631,7 +629,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// Do selection
// }
auto expected_kernel_key = this->GetExpectedKernelType(ctx);
auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
......@@ -640,56 +639,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
KernelTypeToString(expected_kernel_key));
}
// do data transform
Scope& new_scope = scope.NewScope();
std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope =
TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
std::vector<std::string> inplace_vars;
for (auto& var_name_item : this->Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope.FindVar(var_name);
if (var && VarIsTensor(var)) {
auto* tensor_in = GetTensorFromVar(var);
if (tensor_in->IsInitialized()) {
auto kernel_type_for_var = this->GetKernelTypeForVar(
var_name_item.first, *tensor_in, expected_kernel_key);
if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) {
auto out_var_names = OutputVars(true);
if (std::find(out_var_names.begin(), out_var_names.end(),
var_name) != out_var_names.end()) {
inplace_vars.push_back(var_name);
}
VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
auto* trans_var = new_scope.Var(var_name);
std::shared_ptr<Tensor> out(new Tensor);
DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in,
out.get());
CopyVariableWithTensor(*var, *(out.get()), trans_var);
}
}
}
}
// exec scope is the scope that kernel actually executed on.
const Scope& exec_scope =
(transfer_scope == nullptr ? scope : *transfer_scope);
if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
dev_ctx = pool.Get(expected_kernel_key.place_);
}
auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
kernel_iter->second->Compute(
ExecutionContext(*this, new_scope, *new_dev_ctx));
kernel_iter->second->Compute(ExecutionContext(*this, exec_scope, *dev_ctx));
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name));
original_tensor->ShareDataWith(*transformed_tensor);
if (!transfered_inplace_vars.empty()) {
// there are in-place variables that have been transferred.
TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
}
/*For profiling/benchmark only*/
if (FLAGS_benchmark) {
new_dev_ctx->Wait();
dev_ctx->Wait();
}
if (FLAGS_check_nan_inf) {
for (auto& vname : OutputVars(true)) {
auto* var = new_scope.FindVar(vname);
auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
......@@ -697,6 +674,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
}
}
void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
auto* transformed_tensor =
GetTensorFromVar(transfer_scope.FindVar(var_name));
original_tensor->ShareDataWith(*transformed_tensor);
}
}
Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const {
Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope.FindVar(var_name);
// Only tensors can be transferred to another device.
if (var == nullptr || !VarIsTensor(var)) {
continue;
}
auto* tensor_in = GetTensorFromVar(var);
if (!tensor_in->IsInitialized()) {
continue;
}
auto kernel_type_for_var = GetKernelTypeForVar(
var_name_item.first, *tensor_in, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
continue;
}
auto out_var_names = OutputVars(true);
if (std::find(out_var_names.begin(), out_var_names.end(), var_name) !=
out_var_names.end()) {
transfered_inplace_vars->emplace_back(var_name);
}
VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
if (new_scope == nullptr) {
new_scope = &scope.NewScope();
}
auto* trans_var = new_scope->Var(var_name);
Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
SetTensorToVariable(*var, out, trans_var);
}
}
return new_scope;
}
proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
......
......@@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase {
// same.
proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
void RunImpl(const Scope& scope, const platform::Place& place) const final;
/**
* Transfer data from the scope to a transfer scope. If no data needs to be
* transferred, it returns nullptr.
*
* transfered_inplace_vars is an output vector.
*/
Scope* TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const;
void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const;
};
extern bool OpSupportGPU(const std::string& op_type);
......
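Taken together, the intended calling sequence of these two helpers is as follows (condensed from the `RunImpl` changes earlier in this diff; kernel lookup and profiling elided, not a standalone program):

```c++
// Condensed from OperatorWithKernel::RunImpl above.
std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope =
    TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);

// exec_scope is the scope the kernel actually executes in: the transfer
// scope if any input needed a transform, otherwise the original scope.
const Scope& exec_scope =
    (transfer_scope == nullptr ? scope : *transfer_scope);
kernel_iter->second->Compute(ExecutionContext(*this, exec_scope, *dev_ctx));

// Share the transformed in-place outputs back into the original scope.
if (!transfered_inplace_vars.empty()) {
  TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
}
```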
......@@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
t->set_lod(lod_tensors[j].lod());
}
}
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
}
ParallelExecutor::~ParallelExecutor() {
......
......@@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
if (platform::is_same_place(src_place, dst_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
if (platform::is_same_place(ctx_place, src_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (platform::is_same_place(ctx_place, dst_place)) {
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
PADDLE_THROW("ctx does not belong to dst_gpu_place or src_gpu_place.");
}
}
}
#endif
}
......@@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(src.place())) {
dev_ctx = pool.Get(src.place());
} else {
if (platform::is_gpu_place(dst_place)) {
dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(src.place());
}
TensorCopy(src, dst_place, *dev_ctx, dst);
}
......
......@@ -23,10 +23,25 @@ limitations under the License. */
namespace paddle {
namespace framework {
// NOTE(zcd): Because TensorCopy is an async operation, when src_place and
// dst_place are two different GPUs, TensorCopy inserts a wait on the source
// context to ensure the copy is carried out correctly. If ctx_place and
// src_place are the same, src_ctx.Wait() is added after memory::Copy;
// if ctx_place and dst_place are the same, src_ctx.Wait() is added
// before memory::Copy.
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst);
// NOTE(zcd): If src.place() and dst_place are two different GPUs, the copy
// is carried out on dst_place's stream. This is very important because
// TensorCopy is an async operation, and in most cases dst is used on
// dst_place's stream as soon as the call returns. If the copy were issued
// on src_place's stream instead, it might not yet be complete when dst is
// used on dst_place's stream.
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
......
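A minimal usage sketch of the semantics described in the two notes above (illustrative only; assumes a build with at least two CUDA devices):

```c++
// Illustrative sketch. The two-argument TensorCopy picks dst_place's
// DeviceContext (see the tensor_util.cc change above), so a cross-GPU
// copy is enqueued on the stream that will later consume `dst`.
framework::Tensor src;
src.mutable_data<float>(framework::make_ddim({2, 3}), platform::CUDAPlace(0));
// ... fill `src` on device 0 ...

framework::Tensor dst;
framework::TensorCopy(src, platform::CUDAPlace(1), &dst);
// `dst` can now safely be consumed on device 1's stream.

// For host-side inspection, the synchronous variant avoids any stream
// bookkeeping:
framework::Tensor cpu_tensor;
framework::TensorCopySync(src, platform::CPUPlace(), &cpu_tensor);
```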
# Inference Analysis
The `inference/analysis` module is used to analyze and optimize the inference program.
It borrows some philosophy from `LLVM/analysis`
and makes the various optimization features pluggable so that they can co-exist in a pipeline.
We borrowed some concepts from LLVM, such as
- [Pass](./pass.h)es, which implement optimizations that traverse the inference program,
- [DataFlowGraph](./data_flow_graph.h), which represents the data flow graph built from a program,
- [PassManager](./pass_manager.h), which manages a sequence of `Pass`es over a graph.
There are some other basic concepts here
- [Node](./node.h), the node in a `DataFlowGraph`,
- `Function`, the Operator in Fluid,
- `Value`, the Variable in Fluid,
- [Argument](./argument.h), the argument that serves as the input and output of all `Pass`es in the pipeline.
## How it works
The `inference/analysis` module arranges all the passes into a pipeline and works as follows:
1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc.
2. Call the middle passes one by one; the same `DataFlowGraph` is passed across all the passes.
3. Transform the modified `DataFlowGraph` back into a new ProgramDesc.
New optimization features can be added as independent `Pass`es and controlled by gflags;
each pass generates unified debug information or a visualization for easier debugging.
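As a sketch of that flow (the class names follow the concepts above, but the registration and run APIs shown here are assumptions for illustration; see [pass_manager.h](./pass_manager.h) for the real interfaces):

```c++
// Hypothetical pipeline assembly, for illustration only.
Argument argument;  // carries the inference ProgramDesc in and out
PassManager manager;
manager.Register("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
manager.Register("tensorrt-subgraph", new TensorRTSubGraphPass);
manager.Register("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);

manager.Initialize(&argument);  // 1. build the DataFlowGraph from the program
manager.RunAll();               // 2. run the middle passes on the same graph
// 3. `argument` now holds the transformed ProgramDesc.
```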
## Supported Passes
### `FluidToDataFlowGraphPass`
Transforms the Fluid `ProgramDesc` into a `DataFlowGraph`, giving an abstract representation for all the middle passes;
this should be the first pass of the pipeline.
### `DataFlowGraphToFluidPass`
Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline.
### `TensorRTSubgraphNodeMarkPass`
Marks the `Node`s that are supported by TensorRT;
this pass generates a visualization file that can be used for debugging.
### `TensorRTSubGraphPass`
Splits out the sub-graphs that can be accelerated by TensorRT.
### `DFG_GraphvizDrawPass`
This pass is for debugging only; it visualizes the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
It can be used as a helper class that draws the modified graph after each pass.
## Utilities
There are some helper functions/classes for analysis.
- [dot.h](./dot.h) gives an easy-to-use interface for generating `DOT` code.
- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms; it uses `iterator`s to make the algorithms easy to share across different passes.
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
......
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
/*
* This file contains Analyzer, a class exposed as a library that analyzes
* and optimizes
......
......@@ -138,7 +138,7 @@ struct GraphTraits<DataFlowGraph> {
// sub-graph are the input nodes and output nodes that are not inside the
// sub-graph.
static std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs;
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/proto_desc.h"
......@@ -150,13 +151,14 @@ namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {}
explicit DFG_DebuggerPass(const Config& config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
bool Finalize() override { return true; }
};
}
} // namespace
Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
......
......@@ -19,6 +19,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
......
......@@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
const bool display_deleted_node;
};
DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
bool Initialize(Argument *argument) override { return true; }
void Run(DataFlowGraph *graph) override;
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "analyzer.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
......@@ -88,7 +88,8 @@ namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {}
explicit DFG_DebuggerPass(const Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
bool Finalize() override { return true; }
};
......
......@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include <gtest/gtest.h>
namespace paddle {
namespace inference {
namespace analysis {
......
......@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include <string>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/node_attr_flags.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
namespace paddle {
namespace inference {
......@@ -29,7 +31,7 @@ void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
class DfgDebuggerPass : public DFG_GraphvizDrawPass {
public:
DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override {
......
......@@ -20,6 +20,12 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
DEFINE_bool(init_allocated_mem, false,
            "Some op implementations mistakenly assume that memory "
            "allocated by BuddyAllocator is always zeroed. To catch such "
            "errors early, init_allocated_mem indicates that newly "
            "allocated memory should be filled with a small non-zero "
            "value during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle {
......@@ -41,6 +47,9 @@ template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
return p;
}
......@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev);
}
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
return ptr;
}
......@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
}
......
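A hypothetical example (not Paddle code) of the bug class this flag is designed to surface:

```c++
// Hypothetical op body, for illustration only. It accumulates into its
// output buffer without zero-initializing it first, silently relying on
// the allocator returning zeroed memory.
platform::CPUPlace place;
float* out = static_cast<float*>(memory::Alloc(place, n * sizeof(float)));
// BUG: missing memset(out, 0, n * sizeof(float));
for (size_t i = 0; i < num_updates; ++i) {
  out[indices[i]] += values[i];  // sparse accumulate
}
// With FLAGS_init_allocated_mem=true the buffer starts filled with 0xEF
// bytes (each float is roughly -1.5e29), so reads of unwritten elements
// show up as wildly wrong values in unit tests instead of passing by luck.
```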
......@@ -115,8 +115,11 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
auto src_memory =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
auto src_memory = memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// create primitive descriptor for batch norm forward
......@@ -251,14 +254,20 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
// create mkldnn memory from input diff_y tensor
auto user_diff_dst_memory =
memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
mkldnn_engine},
mkldnn::memory::format dst_format =
platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
auto user_diff_dst_memory = memory(
{{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
to_void_cast(diff_y_data));
// create mkldnn memory from input x tensor
auto src_memory =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
auto src_memory = memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// for diff_dst, try to use same format as dst in forward pass
......
......@@ -228,7 +228,7 @@ class MKLDNNHandler {
return dstr;
};
return dims2str(operand_dims) + suffix;
};
}
protected:
const MKLDNNDeviceContext& dev_ctx_;
......@@ -237,5 +237,15 @@ class MKLDNNHandler {
bool is_reusing_;
};
inline mkldnn::memory::format MKLDNNFormatForSize(
size_t dims_size, mkldnn::memory::format data_format) {
if (dims_size == 1) {
return mkldnn::memory::format::x;
} else if (dims_size == 2) {
return mkldnn::memory::format::nc;
}
return data_format;
}
} // namespace platform
} // namespace paddle
......@@ -118,7 +118,8 @@ def __bootstrap__():
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem'
]
if core.is_compiled_with_cuda():
read_env_flags += [
......
......@@ -18,6 +18,8 @@ import unittest
import paddle.fluid as fluid
import time
import numpy as np
import math
import sys
__all__ = ['TestParallelExecutorBase']
......@@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase):
print "%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin))
avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean()
if math.isnan(float(avg_last_loss_val)) or math.isnan(
float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.")
print first_loss, last_loss
# self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
......@@ -16,6 +16,8 @@ import paddle.fluid as fluid
import numpy as np
import unittest
import os
import sys
import math
def simple_fc_net():
......@@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
train_loss, = train_exe.run([loss.name], feed=feed_dict)
avg_test_loss_val = np.array(test_loss).mean()
if math.isnan(float(avg_test_loss_val)):
sys.exit("got NaN loss, testing failed.")
avg_train_loss_val = np.array(train_loss).mean()
if math.isnan(float(avg_train_loss_val)):
sys.exit("got NaN loss, training failed.")
self.assertTrue(
np.allclose(
train_loss, test_loss, atol=1e-8),
......