未验证 提交 1d6dc8c4 编写于 作者: L lujun 提交者: GitHub

Merge pull request #17 from PaddlePaddle/develop

merge to local
......@@ -134,7 +134,7 @@ paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits',
paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d'))
paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996'))
paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e'))
paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '013795af319e2e86d3506741941078ee'))
paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e'))
paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b'))
paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b'))
......
......@@ -195,8 +195,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
......@@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
......@@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// output tensor has the same dims as input. Reorder don't change dims
out->Resize(in.dims());
// tempory mem pd fr out , to make reorder
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
if (in_format != out_format) {
void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
auto out_memory = memory(out_mem_pd, out_data);
auto in_memory =
memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
platform::Reorder(in_memory, out_memory);
} else {
out->ShareDataWith(in);
}
out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
#endif
}
......
......@@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor);
// TODO(jczaja): Remove that once all mkldnn ops
// are modified to work with mkldnn_blocked
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.set_mkldnn_prim_desc(out_mem_pd);
out.set_layout(DataLayout::kMKLDNN);
out.set_format(out_format);
#endif
} else {
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
......@@ -10,7 +10,10 @@ cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framewor
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
......@@ -104,5 +107,7 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass)
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass
alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass
fuse_adam_op_pass fuse_sgd_op_pass)
......@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB
"fuse_parameter_memory_size is up limited memory size "
"of one group parameters' gradient which is the input "
......@@ -105,20 +106,29 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
auto ele_dtype = iter->second->Var()->GetDataType();
if (dtype == kDefaultDtype) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype);
PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
"The data type should not be bool.");
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
PADDLE_ENFORCE_EQ(ele_dtype, dtype,
"The data type of input is not consistent.");
}
// Create the fused variable name.
// Create a FusedVarsSet to avoid duplicating names for fused_var in other
// pass.
if (!result.Has(kFusedVars)) {
result.Set(kFusedVars, new FusedVars);
}
const std::string prefix(kFusedVarNamePrefix);
// The fused_var_name should be unique.
auto fused_var_name = prefix + "GRAD@" + params_grads[0].second;
// the kFusedGrads is used be fuse_optimizer_op_pass.
result.Set(kFusedGrads, new FusedGrads);
// the fused_var_name should be unique, so it appends
// params_grads.begin()->second.
auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" +
params_grads.begin()->second;
result.Get<FusedGrads>(kFusedGrads) = fused_var_name;
auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
"%s is duplicate in FusedVars.", fused_var_name);
fused_var_set.insert(fused_var_name);
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
......@@ -295,17 +305,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
return type == proto::VarType::LOD_TENSOR;
}
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
op_desc->SetOutput("FusedOutput", {fused_var_name});
}
void RecordParamsAndGrads(ir::Node *node,
ParamsAndGrads *params_grads) const {
try {
......@@ -358,6 +357,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
}
// Alloc continuous space for vars.
std::vector<std::string> grads_name;
std::vector<std::string> params_name;
grads_name.reserve(params_grads.size());
......@@ -370,7 +370,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
program_desc.MutableBlock(0));
// Run Only Once Programs
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
......@@ -378,6 +377,17 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
}
}
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
op_desc->SetOutput("FusedOutput", {fused_var_name});
}
};
} // namespace details
......
......@@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() {
if (places_.size() == 1) return;
// The input and output may have dummy vars.
VarHandle *in_var_handle;
{
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
in_var_handle = in_var_handles[0];
}
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
VarHandle *in_var_handle = in_var_handles[0];
WaitInputVarGenerated();
std::vector<const Scope *> var_scopes;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
......@@ -82,23 +81,43 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("inplace_pass");
}
if (strategy.fuse_elewise_add_act_ops_) {
if (strategy_.fuse_elewise_add_act_ops_) {
VLOG(10) << "Add fuse_elewise_add_act_pass";
AppendPass("fuse_elewise_add_act_pass");
}
// for single card training, fuse_all_reduce_ops is unnecessary.
// alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
if (strategy.fuse_all_reduce_ops_) {
if (strategy_.fuse_all_reduce_ops_) {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
}
if (strategy_.fuse_all_optimizer_ops_) {
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
strategy_.is_distribution_) {
VLOG(3)
<< "Currently, fuse_all_optimizer_ops only works under AllReduce "
"mode.";
strategy_.fuse_all_optimizer_ops_ = false;
} else {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
// NOTE: fuse_all_xx_ops will count the number of xx operator first,
// if the number is zero, fuse_all_reduce_ops will do nothing.
// Currently, only one type of optimization algorithm can be fused.
VLOG(10) << "Add fuse_adam_op_pass";
AppendPass("fuse_adam_op_pass");
VLOG(10) << "Add fuse_sgd_op_pass";
AppendPass("fuse_sgd_op_pass");
}
}
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
......@@ -118,14 +137,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// the de-fact IR, any reuse on Graph is meaningless.
// A side-effect of that, memory optimize cannot forsee the fetched vars
// , so fetchlist should be set persistable before call the Run interface.
if (strategy.memory_optimize_) {
if (strategy_.memory_optimize_) {
VLOG(10) << "Add memory_optimize_pass";
AppendPass("memory_optimize_pass");
}
AppendMultiDevPass(strategy);
AppendMultiDevPass(strategy_);
if (strategy.fuse_all_reduce_ops_) {
if (strategy_.fuse_all_reduce_ops_) {
// NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
// first, if the number is zero, fuse_all_reduce_ops will do nothing.
VLOG(10) << "Add fuse_all_reduce_op_pass";
......@@ -151,7 +170,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("all_reduce_deps_pass");
}
if (SeqOnlyAllReduceOps(strategy)) {
if (SeqOnlyAllReduceOps(strategy_)) {
VLOG(10) << "Add all_reduce_deps_pass";
AppendPass("all_reduce_deps_pass");
}
......@@ -165,7 +184,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Convert graph to run on multi-devices.
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass = nullptr;
if (strategy_.is_distribution_) {
if (strategy.is_distribution_) {
VLOG(10) << "Add dist_multi_devices_pass";
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
......@@ -235,17 +254,22 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
} else if (pass->Type() == "fuse_all_reduce_op_pass") {
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
pass->Type() == "fuse_adam_op_pass" ||
pass->Type() == "fuse_sgd_op_pass" ||
pass->Type() == "fuse_all_reduce_op_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
if (pass->Type() == "fuse_all_reduce_op_pass") {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
}
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
......@@ -294,4 +318,6 @@ USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(alloc_continuous_space_for_grad_pass);
USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_all_reduce_op_pass);
......@@ -18,7 +18,6 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
......@@ -76,6 +75,8 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
bool fuse_all_optimizer_ops_{false};
bool fuse_all_reduce_ops_{false};
bool fuse_relu_depthwise_conv_{false};
......
......@@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
local_scopes_(local_scopes),
places_(places),
graph_(graph),
fetch_ctxs_(places),
pool_(strategy.num_threads_),
prepare_pool_(1), // add one more thread for generate op_deps
fetch_ctxs_(places) {
// add one more thread for generate op_deps
prepare_pool_(1) {
for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
int dep = static_cast<int>(op->NotReadyInputSize());
op_deps_.emplace(op, dep);
......
......@@ -14,7 +14,9 @@
#pragma once
#include <ThreadPool.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
......@@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
const ir::Graph &Graph() const override;
private:
// Note(zcd): the ThreadPool should be placed last so that ThreadPool should
// be destroyed first.
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
......@@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_;
::ThreadPool pool_;
::ThreadPool prepare_pool_;
platform::DeviceContextPool fetch_ctxs_;
std::atomic<int> remaining_;
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
::ThreadPool pool_;
::ThreadPool prepare_pool_;
void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op,
const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
void PrepareAtomicOpDeps();
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
};
} // namespace details
} // namespace framework
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_adam_op_pass.h"
#include <algorithm>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }
const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
}
void FuseAdamOpPass::FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);
FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"),
adam_ops, graph);
FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"),
adam_ops, graph);
}
void FuseAdamOpPass::FuseAdamOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0));
// Check attributions
// NOTE: If new attribution is added, the following code maybe need change.
int op_role = boost::get<int>(
adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
float beta1 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta1"));
float beta2 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta2"));
float epsilon = boost::get<float>(adam_ops[0]->Op()->GetAttr("epsilon"));
bool lazy_mode = boost::get<bool>(adam_ops[0]->Op()->GetAttr("lazy_mode"));
int64_t min_row_size_to_use_multithread = boost::get<int64_t>(
adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread"));
for (auto &adam_op : adam_ops) {
PADDLE_ENFORCE_EQ(beta1,
boost::get<float>(adam_op->Op()->GetAttr("beta1")));
PADDLE_ENFORCE_EQ(beta2,
boost::get<float>(adam_op->Op()->GetAttr("beta2")));
PADDLE_ENFORCE_EQ(epsilon,
boost::get<float>(adam_op->Op()->GetAttr("epsilon")));
PADDLE_ENFORCE_EQ(lazy_mode,
boost::get<bool>(adam_op->Op()->GetAttr("lazy_mode")));
PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread,
boost::get<int64_t>(adam_op->Op()->GetAttr(
"min_row_size_to_use_multithread")));
PADDLE_ENFORCE_EQ(op_role, boost::get<int>(adam_op->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())));
}
// NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
// node.
VLOG(10) << "Insert adam to graph ";
OpDesc adam_desc(adam_ops[0]->Op()->Block());
adam_desc.SetType("adam");
adam_desc.SetInput("Param", {fused_vars_name.at("Param")});
adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate"));
adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
adam_desc.SetAttr("beta1", beta1);
adam_desc.SetAttr("beta2", beta2);
adam_desc.SetAttr("epsilon", epsilon);
adam_desc.SetAttr("lazy_mode", lazy_mode);
adam_desc.SetAttr("min_row_size_to_use_multithread",
min_row_size_to_use_multithread);
adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto adam_node = graph->CreateOpNode(&adam_desc);
InserInputAndOutputForOptOps(adam_ops, adam_node);
}
void FuseAdamOpPass::FuseScaleOps(const std::vector<std::string> &beta_name,
const std::string &fused_var_name,
const std::vector<ir::Node *> &adam_ops,
ir::Graph *graph) const {
PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size());
const std::string scale_op_name = "scale";
// Get the scale_ops of dealing the adam's beta var.
std::vector<ir::Node *> scale_ops;
scale_ops.reserve(beta_name.size());
for (size_t i = 0; i < adam_ops.size(); ++i) {
auto &beta_1_pow_name = beta_name[i];
auto beta_pow_iter = std::find_if(
adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(),
[&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool {
return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name;
});
PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end());
auto beta_pow_node = *beta_pow_iter;
auto scale_op_iter = std::find_if(
beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(),
[&scale_op_name](ir::Node *op_node) -> bool {
return op_node->Op() && op_node->Op()->Type() == scale_op_name;
});
PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end());
scale_ops.emplace_back(*scale_op_iter);
}
PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size());
// Check attributions
// NOTE: If new attribution is added, the following code maybe need change.
int op_role = boost::get<int>(
scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
float scale = boost::get<float>(scale_ops[0]->Op()->GetAttr("scale"));
float bias = boost::get<float>(scale_ops[0]->Op()->GetAttr("bias"));
bool bias_after_scale =
boost::get<bool>(scale_ops[0]->Op()->GetAttr("bias_after_scale"));
for (auto &scale_op : scale_ops) {
PADDLE_ENFORCE_EQ(scale,
boost::get<float>(scale_op->Op()->GetAttr("scale")));
PADDLE_ENFORCE_EQ(bias, boost::get<float>(scale_op->Op()->GetAttr("bias")));
PADDLE_ENFORCE_EQ(
bias_after_scale,
boost::get<bool>(scale_op->Op()->GetAttr("bias_after_scale")));
PADDLE_ENFORCE_EQ(op_role, boost::get<int>(scale_op->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())));
}
// NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
// node.
VLOG(10) << "Insert fused scale to graph.";
OpDesc scale_desc(scale_ops[0]->Op()->Block());
scale_desc.SetType("scale");
scale_desc.SetInput("X", {fused_var_name});
scale_desc.SetOutput("Out", {fused_var_name});
scale_desc.SetAttr("scale", scale);
scale_desc.SetAttr("bias", bias);
scale_desc.SetAttr("bias_after_scale", bias_after_scale);
scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto scale_node = graph->CreateOpNode(&scale_desc);
for (auto scale_op : scale_ops) {
// set inputs
scale_node->inputs.insert(scale_node->inputs.begin(),
scale_op->inputs.begin(), scale_op->inputs.end());
for (auto &input : scale_op->inputs) {
std::replace(input->outputs.begin(), input->outputs.end(), scale_op,
scale_node);
}
// set outputs
scale_node->outputs.insert(scale_node->outputs.begin(),
scale_op->outputs.begin(),
scale_op->outputs.end());
for (auto &output : scale_op->outputs) {
std::replace(output->inputs.begin(), output->inputs.end(), scale_op,
scale_node);
}
}
// Delete scale_ops
for (auto &scale_op : scale_ops) {
graph->RemoveNode(scale_op);
}
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_adam_op_pass, paddle::framework::details::FuseAdamOpPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseAdamOpPass : public FuseOptimizerOpPass {
private:
virtual const std::string GetOpType() const;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
// Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow"
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;
void FuseAdamOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;
void FuseScaleOps(const std::vector<std::string> &aux_var_set,
const std::string &fused_var_name,
const std::vector<ir::Node *> &adam_ops,
ir::Graph *graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include <algorithm>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
const std::string fuse_op_type = GetOpType();
const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
// Step 1: Get the specified op and auxiliary variables.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
std::vector<ir::Node *> opt_ops;
for (auto &node : topo_nodes) {
GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops,
&aux_var_set);
}
VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size();
if (opt_ops.size() == 0) {
return;
}
if (result.Has(kFusedOptType)) {
VLOG(10)
<< "Currently only support fusing one type optimizer op. Has fused "
<< result.Get<FusedOptType>(kFusedOptType);
return;
} else {
result.Set(kFusedOptType, new FusedOptType);
}
result.Get<FusedOptType>(kFusedOptType) = fuse_op_type;
// Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
// initialized in scopes before execution.
if (!result.Has(kFusedVars)) {
result.Set(kFusedVars, new FusedVars);
}
std::unordered_map<std::string, std::string> fused_vars_name;
fused_vars_name.reserve(aux_var_names.size() + 1);
auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
const std::string prefix(kFusedVarNamePrefix);
// NOTE: the fused_var_name should be unique.
for (auto &var_name : aux_var_names) {
auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
aux_var_set[var_name][0];
VLOG(10) << fused_var_name;
fused_vars_name.emplace(var_name, fused_var_name);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
fused_var_set.insert(fused_var_name);
}
// Step 3: Get the fused Gradient's name
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
if (!result.Has(kFusedGrads)) {
PADDLE_THROW(
"The alloc_continuous_space_for_grad_pass should be called before this "
"pass.");
}
auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
auto &fused_vars = result.Get<FusedVars>(kFusedVars);
auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
fused_vars_name.emplace("Grad", fused_grad);
// Step 4: Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly.
PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(),
"The size of params_grads and aux_var_set are not equal.");
SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
// Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
aux_var_set, fused_vars_name);
// Step 6: Fuse optimizer Ops and Scale Ops
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
// Step 7: Remove optimizer Ops
for (auto &opt_op : opt_ops) {
graph->RemoveNode(opt_op);
}
}
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const {
VLOG(10) << "Init FusedVars.";
// Alloc parameters and auxiliary vars in the respective scope.
size_t idx = local_scopes.size();
for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
++iter, --idx) {
auto &scope = *iter;
for (auto &var_name : aux_var_names) {
auto fused_var_name = fused_vars_name.at(var_name);
VLOG(10) << "Init " << fused_var_name;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has exist in scope[%d]", fused_var_name, idx);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
}
}
ProgramDesc program_desc;
auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace(aux_var_set.at(var_name),
fused_vars_name.at(var_name), true,
global_block);
}
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : global_block->AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]);
}
}
}
void FuseOptimizerOpPass::SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
std::vector<ir::Node *> *ops) const {
PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast<size_t>(0));
auto &param_vec = aux_vars_set->at("Param");
std::vector<size_t> param_sort_idx;
param_sort_idx.reserve(param_vec.size());
for (auto &p_g : params_grads) {
auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first);
PADDLE_ENFORCE(iter != param_vec.end());
auto idx = std::distance(param_vec.begin(), iter);
param_sort_idx.emplace_back(idx);
}
for (auto &aux_vars : *aux_vars_set) {
std::vector<std::string> sorted_vars;
sorted_vars.reserve(aux_vars.second.size());
for (size_t i = 0; i < aux_vars.second.size(); ++i) {
sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i]));
}
std::swap(aux_vars.second, sorted_vars);
std::stringstream out;
for (auto &var_name : aux_vars.second) {
out << var_name << " ";
}
VLOG(10) << aux_vars.first << ": " << out.str();
}
std::vector<ir::Node *> sorted_ops;
sorted_ops.reserve(ops->size());
for (size_t i = 0; i < ops->size(); ++i) {
sorted_ops.emplace_back(ops->at(param_sort_idx[i]));
}
std::swap(*ops, sorted_ops);
}
void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
const std::string &op_type, const std::vector<std::string> &aux_vars_name,
ir::Node *node, std::vector<ir::Node *> *ops,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const {
if (node->Op()->Type() != op_type) return;
for (auto &var_n : aux_vars_name) {
auto arg_names = node->Op()->Input(var_n);
PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
(*aux_args_name)[var_n].emplace_back(arg_names[0]);
VLOG(10) << var_n << ", " << arg_names[0];
}
ops->emplace_back(node);
}
void FuseOptimizerOpPass::AppendAllocContinuousSpace(
const std::vector<std::string> &args, const std::string &out_arg,
bool copy_data, BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", args);
op_desc->SetOutput("Output", args);
op_desc->SetOutput("FusedOutput", {out_arg});
op_desc->SetAttr("copy_data", copy_data);
op_desc->SetAttr("check_name", true);
}
void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
const std::vector<ir::Node *> &opt_ops, ir::Node *opt_node) const {
std::unordered_set<ir::Node *> inputs;
std::unordered_set<ir::Node *> outputs;
for (auto opt_op : opt_ops) {
// set inputs
inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
for (auto &input : opt_op->inputs) {
replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node);
}
// set outputs
outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end());
for (auto &output : opt_op->outputs) {
replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node);
}
}
opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(),
inputs.end());
opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(),
outputs.end());
}
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseOptimizerOpPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override;
protected:
virtual void SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_var_set,
std::vector<ir::Node *> *ops) const;
void InserInputAndOutputForOptOps(const std::vector<ir::Node *> &opt_ops,
ir::Node *opt_node) const;
private:
virtual const std::string GetOpType() const = 0;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const = 0;
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
void GetSpecifiedOpsAndVars(
const std::string &op_type, const std::vector<std::string> &aux_vars_name,
ir::Node *node, std::vector<ir::Node *> *ops,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const;
void AppendAllocContinuousSpace(const std::vector<std::string> &args,
const std::string &out_arg, bool copy_data,
BlockDesc *global_block) const;
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name)
const;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h"
#include <algorithm>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }
const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
return {"Param"};
}
void FuseSgdOpPass::FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph);
}
void FuseSgdOpPass::FuseSgdOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast<size_t>(0));
// NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var
// node.
int op_role = boost::get<int>(
sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
VLOG(10) << "Insert sgd to graph ";
// Add fused scale
OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
Sgd_desc.SetType("sgd");
Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")});
Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate"));
// NOTE: multi_devices_pass requires that every op should have a role.
Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto sgd_node = graph->CreateOpNode(&Sgd_desc);
InserInputAndOutputForOptOps(sgd_ops, sgd_node);
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::details::FuseSgdOpPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseSgdOpPass : public FuseOptimizerOpPass {
private:
virtual const std::string GetOpType() const;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
// Fuse Sgd Ops
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
void FuseSgdOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -24,6 +24,19 @@ namespace paddle {
namespace framework {
namespace details {
// Note(zcd): Addresses should be aligned, otherwise, the results may have
// diff.
static size_t Alignment(size_t size, const platform::Place &place) {
// Allow to allocate the minimum chunk size is 4 KB.
size_t alignment = 1 << 12;
if (platform::is_gpu_place(place)) {
// Allow to allocate the minimum chunk size is 256 B.
alignment = 1 << 8;
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;
......@@ -111,10 +124,11 @@ void FusedAllReduceOpHandle::RunImpl() {
return grad1.second->data<void>() < grad2.second->data<void>();
});
size_t size_of_dtype = framework::SizeOfType(dtype);
for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data<void>();
int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = len * framework::SizeOfType(dtype);
auto offset = Alignment(len * size_of_dtype, places_[0]);
void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data<void>();
......@@ -228,18 +242,21 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
proto::VarType::Type *dtype, int64_t *numel) const {
*numel = 0;
size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
*numel += len;
// Get dtype
auto ele_type = grad_tensor.at(i).second->type();
if (i == 0) {
*dtype = ele_type;
size_of_dtype = framework::SizeOfType(ele_type);
}
PADDLE_ENFORCE_EQ(ele_type, *dtype);
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
// Alignment(len)
*numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
}
......
......@@ -156,7 +156,6 @@ void InplacePass::ApplyImpl(ir::Graph* graph) const {
continue;
TryInplaceOpInputOutput(op, graph);
}
// graph->ResolveHazard(var_nodes_);
}
void InplacePass::InplaceModifyDesc(const std::string& var,
......@@ -168,7 +167,7 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
op_desc->Flush();
}
}
......@@ -265,8 +264,6 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
// PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
// "op_desc is nullptr");
// some pre-requirments need to meet if the op want to inplaced.
PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
......@@ -446,19 +443,20 @@ bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
// check if op2 depends on op1's output
bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
auto print_op = [&](ir::Node* op, const char* name) {
std::ostringstream os;
os << " " << name << " : " << op->Name() << " ";
os << "Input args : ";
for (auto& arg : op->inputs) os << arg->Name() << " ";
os << "Output args : ";
for (auto& arg : op->outputs) os << arg->Name() << " ";
os << "Level : " << op_level_.at(op);
VLOG(4) << os.str();
};
print_op(op1, "OP1");
print_op(op2, "OP2");
if (VLOG_IS_ON(4)) {
auto print_op = [&](ir::Node* op, const char* name) {
std::ostringstream os;
os << " " << name << " : " << op->Name() << " ";
os << "Input args : ";
for (auto& arg : op->inputs) os << arg->Name() << " ";
os << "Output args : ";
for (auto& arg : op->outputs) os << arg->Name() << " ";
os << "Level : " << op_level_.at(op);
VLOG(4) << os.str();
};
print_op(op1, "OP1");
print_op(op2, "OP2");
}
if (op1 == op2) return true;
if (op_level_.at(op1) >= op_level_.at(op2)) return false;
......
......@@ -142,16 +142,15 @@ TEST(OrderedSet, FindBestFitNode) {
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FIXME(liuwei1031) this API has changed,
// disable these tests temporarily
// FindNextBestFitNode
// auto* n = nodes[0].get();
// auto* cache = pool.FindBestFitNode(n);
// PADDLE_ENFORCE(cache->Name() == "a");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "c");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "b");
auto* n = nodes[0].get();
auto* cache = pool.FindBestFitNode(n);
ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c");
auto* cache_b = pool.FindNextBestFitNode(n, cache);
ASSERT_TRUE(cache_b->Name() != cache->Name());
ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c");
cache = pool.FindNextBestFitNode(n, cache_b);
ASSERT_TRUE(cache == nullptr);
}
} // namespace details
......
......@@ -20,7 +20,6 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
......@@ -34,6 +33,10 @@ namespace framework {
class Scope;
namespace details {
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override;
......
......@@ -20,7 +20,6 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
......@@ -41,22 +40,25 @@ namespace details {
// `std::vector<VarHandle*>` is the version of varaibles.
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars;
const char kGraphVars[] = "vars";
// aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars";
constexpr char kGraphVars[] = "vars";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
// aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase *> GraphDepVars;
constexpr char kGraphDepVars[] = "dep_vars";
typedef std::unordered_set<std::string> FusedVars;
constexpr char kFusedVars[] = "fused_vars";
constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
typedef std::string FusedOptType;
constexpr char kFusedOptType[] = "fused_opt_type";
typedef std::string FusedGrads;
constexpr char kFusedGrads[] = "fused_gradients";
typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
constexpr char kParamsAndGrads[] = "params_grads";
......@@ -65,8 +67,6 @@ typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupGradsAndParams;
constexpr char kGroupGradsAndParams[] = "group_grads_params";
constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -24,13 +24,13 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, ir::Graph *graph)
: graph_(graph),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr),
prepare_pool_(1),
local_scopes_(local_scopes),
places_(places),
fetch_ctxs_(places),
strategy_(strategy) {
strategy_(strategy),
prepare_pool_(1),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr) {
PrepareOpDeps();
CopyOpDeps();
}
......
......@@ -63,13 +63,20 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op);
private:
// Note(zcd): the ThreadPool should be placed last so that ThreadPool should
// be destroyed first.
ir::Graph *graph_;
std::unique_ptr<::ThreadPool> pool_;
::ThreadPool prepare_pool_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_;
ExceptionHolder exception_holder_;
std::unique_ptr<OpDependentData> op_deps_;
std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
ExecutionStrategy strategy_;
// use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_;
::ThreadPool prepare_pool_;
std::unique_ptr<::ThreadPool> pool_;
void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
OpHandleBase *op_instance) const;
......@@ -88,14 +95,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
void PrepareOpDeps();
void CopyOpDeps();
private:
std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
ExecutionStrategy strategy_;
std::unique_ptr<OpDependentData> op_deps_;
// use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_;
};
} // namespace details
......
......@@ -12,9 +12,14 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <iterator>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/inplace_op_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
......@@ -165,118 +170,147 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
namespace paddle {
namespace framework {
// TEST(InferInplace, SingleOpInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("single_op");
// op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
// op->SetOutput("Out", {"test2_out"});
//
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_out");
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 1ul);
// auto it = in_to_outs.begin();
// EXPECT_EQ(it->first, "test2_a");
// EXPECT_EQ(it->second, "test2_out");
// }
//
// TEST(InferInplace, SingleGradOpInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("single_op_grad");
// op->SetInput(GradVarName("Out"), {"test2_out"});
// op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
//
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_out");
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 1ul);
// auto it = in_to_outs.begin();
// EXPECT_EQ(it->first, "test2_out");
// EXPECT_EQ(it->second, "test2_a");
// }
//
// TEST(InferInplace, MultiOutInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("multi_out_op");
// op->SetInput("X", {"a0", "a1"});
// op->SetInput("Y", {"b0"});
// op->SetInput("Z", {"c0", "c1"});
// op->SetOutput("Out", {"o0"});
// op->SetOutput("YOut", {"y0"});
// op->SetOutput("ZOut", {"z0"});
//
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("o0");
// prog.MutableBlock(0)->Var("y0");
// prog.MutableBlock(0)->Var("z0");
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 3ul);
// std::unordered_map<std::string, std::string> expects = {
// {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
// };
// EXPECT_TRUE(expects == in_to_outs);
// }
//
// TEST(InferInplace, MultiGradInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("multi_out_grad");
// op->SetInput(GradVarName("Out"), {"o0"});
// op->SetInput(GradVarName("YOut"), {"y0"});
// op->SetInput(GradVarName("ZOut"), {"z0"});
// op->SetOutput(GradVarName("X"), {"a0", "a1"});
// op->SetOutput(GradVarName("Y"), {"b0"});
// op->SetOutput(GradVarName("Z"), {"c0", "c1"});
//
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("o0");
// prog.MutableBlock(0)->Var("y0");
// prog.MutableBlock(0)->Var("z0");
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
//
// EXPECT_EQ(in_to_outs.size(), 3ul);
// std::unordered_map<std::string, std::string> expects = {
// {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
// };
// EXPECT_TRUE(expects == in_to_outs);
// }
void FakeSuccData(ProgramDesc* prog) { // NOLINT
prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_out");
prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 32, 128, 128});
}
void FakeNoInplaceData(ProgramDesc* prog) { // NOLINT
prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog->MutableBlock(0)->Var("test2_out");
prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 31, 128, 128});
}
ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) {
ir::Node* op_node = nullptr;
for (auto& item : g->Nodes()) {
if (item->Name() == name) {
op_node = item;
break;
}
}
return op_node;
}
std::unique_ptr<ir::Graph> test_SingleOpInplaceInToOut(
std::unique_ptr<ir::Graph> g) {
std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op");
EXPECT_NE(op_node, nullptr);
pass->Apply(g.get());
return g;
}
TEST(InferInplace, SingleOpInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("single_op");
op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
op->SetOutput("Out", {"test2_out"});
FakeSuccData(&prog);
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
g = test_SingleOpInplaceInToOut(std::move(g));
auto op_node = GetNodeFromGraph(g.get(), "single_op");
EXPECT_EQ(op_node->outputs[0]->Name(), "test2_a");
}
TEST(InferInplace, SingleOpInplaceInToOutNoInplace) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("single_op");
op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
op->SetOutput("Out", {"test2_out"});
FakeNoInplaceData(&prog);
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
g = test_SingleOpInplaceInToOut(std::move(g));
auto op_node = GetNodeFromGraph(g.get(), "single_op");
EXPECT_EQ(op_node->outputs[0]->Name(), "test2_out");
}
TEST(InferInplace, MultiOutInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("multi_out_op");
op->SetInput("X", {"a0", "a1"});
op->SetInput("Y", {"b0"});
op->SetInput("Z", {"c0", "c1"});
op->SetOutput("Out", {"o0"});
op->SetOutput("YOut", {"y0"});
op->SetOutput("ZOut", {"z0"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
pass->Apply(g.get());
auto op_node = GetNodeFromGraph(g.get(), "multi_out_op");
ASSERT_TRUE(op_node != nullptr);
EXPECT_EQ(op_node->outputs[0]->Name(), "a0");
EXPECT_EQ(op_node->outputs[1]->Name(), "b0");
EXPECT_EQ(op_node->outputs[2]->Name(), "c0");
}
TEST(InferInplace, MultiGradInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("multi_out_grad");
op->SetInput(GradVarName("Out"), {"o0"});
op->SetInput(GradVarName("YOut"), {"y0"});
op->SetInput(GradVarName("ZOut"), {"z0"});
op->SetOutput(GradVarName("X"), {"a0", "a1"});
op->SetOutput(GradVarName("Y"), {"b0"});
op->SetOutput(GradVarName("Z"), {"c0", "c1"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024});
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
pass->Apply(g.get());
auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad");
ASSERT_TRUE(op_node != nullptr);
EXPECT_EQ(op_node->outputs[0]->Name(), "o0");
EXPECT_EQ(op_node->outputs[2]->Name(), "y0");
EXPECT_EQ(op_node->outputs[3]->Name(), "c0");
std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
};
}
} // namespace framework
} // namespace paddle
......@@ -68,21 +68,12 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base)
pass_library(simplify_anakin_detection_pattern_pass inference)
pass_library(anakin_fillconstant_elementwisemul_fuse inference)
pass_library(quant_conv2d_dequant_fuse_pass inference)
pass_library(fillconstant_elementwisemul_fuse inference)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
# be detected by our pass. The index here represents the number of structures in the
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
endforeach()
if(ANAKIN_FOUND)
pass_library(simplify_anakin_priorbox_detection_out_pass inference)
endif()
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base mkldnn)
......
......@@ -15,7 +15,7 @@
#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
......@@ -29,8 +29,8 @@ namespace ir {
GET_IR_NODE(elementwise_mul); \
GET_IR_NODE(elementwise_mul_out);
void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
void FillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "fillconstant_elementwisemul_fuse";
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
......@@ -39,8 +39,8 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
->assert_is_op_input("elementwise_mul", "X")
->AsInput();
patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name);
patterns::FillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name);
pattern(x);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
......@@ -79,5 +79,5 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
} // namespace framework
} // namespace paddle
REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
REGISTER_PASS(fillconstant_elementwisemul_fuse,
paddle::framework::ir::FillconstantElementwisemulFuse);
......@@ -21,9 +21,9 @@ namespace paddle {
namespace framework {
namespace ir {
class AnakinFillconstantElementwisemulFuse : public FusePassBase {
class FillconstantElementwisemulFuse : public FusePassBase {
public:
virtual ~AnakinFillconstantElementwisemulFuse() {}
virtual ~FillconstantElementwisemulFuse() {}
protected:
void ApplyImpl(ir::Graph* graph) const override;
......
......@@ -1471,7 +1471,8 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
}
PDNode *patterns::AnakinDetectionPattern::operator()(
std::vector<PDNode *> conv_in, int times) {
std::vector<PDNode *> conv_in, int times, std::string priorbox_type,
bool is_reshape) {
// The times represents the repeat times of the
// {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape}
const int kNumFields = 7;
......@@ -1486,37 +1487,38 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
const int kMultiClassSecondInputNmsOffset = times + 1;
std::vector<PDNode *> nodes;
std::string op_after_priorbox = is_reshape ? "reshape2" : "flatten2";
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
->assert_is_op("density_prior_box"));
->assert_is_op(priorbox_type));
nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Boxes")
->assert_is_op_input("reshape2", "X")
->assert_is_op_output(priorbox_type, "Boxes")
->assert_is_op_input(op_after_priorbox, "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
->assert_is_op("reshape2"));
->assert_is_op(op_after_priorbox));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_output(op_after_priorbox)
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Variances")
->assert_is_op_input("reshape2", "X")
->assert_is_op_output(priorbox_type, "Variances")
->assert_is_op_input(op_after_priorbox, "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
->assert_is_op("reshape2"));
->assert_is_op(op_after_priorbox));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_output(op_after_priorbox)
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
}
......@@ -1612,7 +1614,7 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
return multiclass_nms_out;
}
PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
PDNode *elementwise_op_input) {
auto fill_constant =
pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
......@@ -1635,6 +1637,76 @@ PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
return elementwise_mul_out;
}
void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
const std::string &op_type,
const std::string &weight_name,
int times) {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
// the quant op always be one.
auto quant_op_in_scale =
pattern->NewNode(GetNodeName("quant_op_in_scale"))
->assert_is_op_input("fake_quantize_range_abs_max", "InScale")
->AsInput();
auto quant_op = pattern->NewNode(GetNodeName("quant_op"))
->assert_is_op("fake_quantize_range_abs_max");
auto quant_op_out_scale =
pattern->NewNode(GetNodeName("quant_op_out_scale"))
->assert_is_op_output("fake_quantize_range_abs_max", "OutScale")
->assert_is_op_input("fake_dequantize_max_abs", "Scale")
->AsIntermediate();
auto quant_op_out =
pattern->NewNode(GetNodeName("quant_op_out"))
->assert_is_op_output("fake_quantize_range_abs_max", "Out")
->assert_is_op_input(op_type)
->AsIntermediate();
// there are 'times' quantized and dequant op
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op_weight") + std::to_string(i))
->assert_is_op_input(op_type, weight_name)
->AsInput());
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op") + std::to_string(i))
->assert_is_op(op_type));
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i))
->assert_is_op_output(op_type)
->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i))
->assert_is_op("fake_dequantize_max_abs"));
nodes.push_back(
pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i))
->assert_is_op_output("fake_dequantize_max_abs", "Out")
->AsOutput());
}
quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
quant_op_out->LinksFrom({quant_op});
for (int i = 0; i < times; i++) {
nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
{quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOffset]});
nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kDequantOpOffset]});
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -848,7 +848,8 @@ struct AnakinDetectionPattern : public PatternBase {
AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times,
std::string priorbox_type, bool is_reshape);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
......@@ -859,9 +860,9 @@ struct AnakinDetectionPattern : public PatternBase {
}
};
struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
const std::string& name_scope)
struct FillConstantElementWiseMulFuse : public PatternBase {
FillConstantElementWiseMulFuse(PDPattern* pattern,
const std::string& name_scope)
: PatternBase(pattern, name_scope,
"anakin_fillconstant_elementwisemul_fuse") {}
......@@ -874,6 +875,22 @@ struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
PATTERN_DECL_NODE(elementwise_mul_out);
};
struct QuantDequantOpFuse : public PatternBase {
QuantDequantOpFuse(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "quant_dequant_fuse") {}
void operator()(PDNode* quant_op_input, const std::string& op_name,
const std::string& weight_name, int times = 1);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
} // namespace patterns
// Link two ir::Nodes from each other.
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
std::string op_type) {
const std::string pattern_name = "quant_dequant_fuse";
// FusePassBase::Init(pattern_name, graph);
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("fake_quantize_range_abs_max", "X")
->AsInput();
std::string quantized_op_type = "";
std::string weight_name = "";
if (op_type == "conv2d") {
quantized_op_type = "conv2d";
weight_name = "Filter";
} else if (op_type == "conv2d_fusion") {
quantized_op_type = "conv2d_fusion";
weight_name = "Filter";
} else if (op_type == "mul") {
quantized_op_type = "mul";
weight_name = "Y";
} else if (op_type == "fc") {
quantized_op_type = "fc";
weight_name = "W";
} else {
PADDLE_ENFORCE(
"QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
"now.");
}
patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
pattern(x, quantized_op_type, weight_name, times);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
PADDLE_ENFORCE(subgraph.count(x));
auto* input_node = subgraph.at(x);
Node* quant_op_in_scale =
subgraph.at(pattern.GetPDNode("quant_op_in_scale"));
Node* quant_op = subgraph.at(pattern.GetPDNode("quant_op"));
Node* quant_op_out_scale =
subgraph.at(pattern.GetPDNode("quant_op_out_scale"));
Node* quant_op_out = subgraph.at(pattern.GetPDNode("quant_op_out"));
std::vector<Node*> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(subgraph.at(
pattern.GetPDNode("quantized_op_weight" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("quantized_op" + std::to_string(i))));
nodes.push_back(subgraph.at(
pattern.GetPDNode("quantized_op_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i))));
}
int bit_length = boost::get<int>(quant_op->Op()->GetAttr("bit_length"));
int range = ((1 << (bit_length - 1)) - 1);
// Prepare input scale
std::string input_scale_var_name = quant_op->Op()->Input("InScale").front();
PADDLE_ENFORCE(scope);
const LoDTensor& input_scale_tensor =
scope->FindVar(input_scale_var_name)->Get<LoDTensor>();
PADDLE_ENFORCE(paddle::platform::is_cpu_place(input_scale_tensor.place()));
const float* input_scale_data = input_scale_tensor.data<float>();
float input_scale = input_scale_data[0];
std::unordered_set<const Node*> delete_nodes;
for (int i = 0; i < times; i++) {
// max_range = (range * range) / weight_scale
float max_range = boost::get<float>(
nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range"));
float weight_scale = (range * range) / max_range;
auto base_op_desc =
*nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto();
std::string new_input = input_node->Name();
std::string new_output =
nodes[i * kNumFields + kDequantOpOutOffset]->Name();
framework::OpDesc new_op_desc(base_op_desc, nullptr);
new_op_desc.SetType(quantized_op_type);
if (quantized_op_type == "conv2d" ||
quantized_op_type == "conv2d_fusion") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Output", {new_output});
} else if (quantized_op_type == "fc") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Out", {new_output});
} else if (quantized_op_type == "mul") {
new_op_desc.SetInput("X", {new_input});
new_op_desc.SetOutput("Out", {new_output});
}
new_op_desc.SetAttr("enable_int8", true);
new_op_desc.SetAttr("input_scale", input_scale);
new_op_desc.SetAttr("weight_scale", weight_scale);
new_op_desc.Flush();
auto* new_op = graph->CreateOpNode(&new_op_desc);
IR_NODE_LINK_TO(input_node, new_op);
IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op);
IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]);
delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]);
}
delete_nodes.insert(quant_op_in_scale);
delete_nodes.insert(quant_op);
delete_nodes.insert(quant_op_out);
delete_nodes.insert(quant_op_out_scale);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph, delete_nodes);
};
gpd(graph, handler);
}
void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "quant_dequant_fuse";
FusePassBase::Init(pattern_name, graph);
std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul"};
auto* scope = param_scope();
for (auto& op_type : quantized_op_types) {
for (int i = 1; i <= 6; i++) {
RunQuantDequant(graph, scope, i, op_type);
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(quant_conv2d_dequant_fuse_pass,
paddle::framework::ir::QuantDequantFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class QuantDequantFusePass : public FusePassBase {
public:
virtual ~QuantDequantFusePass() {}
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -17,25 +17,24 @@
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h"
#include "paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h"
namespace paddle {
namespace framework {
namespace ir {
template <int times>
void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
ir::Graph *graph) const {
void RunSimplifyAnakinDetection(ir::Graph *graph, int times, bool is_density,
bool is_reshape) {
const std::string pattern_name =
"simplify_anakin_detection_pattern_pass" + std::to_string(times);
FusePassBase::Init(pattern_name, graph);
std::string priorbox_type = is_density ? "density_prior_box" : "prior_box";
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
for (int i = 0; i < times; i++) {
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(i))
->assert_is_op_input("density_prior_box", "Input")
->assert_is_op_input(priorbox_type, "Input")
->AsInput());
}
input_nodes.push_back(gpd.mutable_pattern()
......@@ -49,7 +48,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
->AsInput());
patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
pattern(input_nodes, times);
pattern(input_nodes, times, priorbox_type, is_reshape);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
......@@ -119,8 +118,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
bool box_normalized =
boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
// auto variance =
// boost::get<std::vector<float>>(box_coder_op->Op()->GetAttr("variance"));
int background_label =
boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
float score_threshold =
......@@ -138,7 +136,6 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
}
// int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
framework::OpDesc concat1_desc;
concat1_desc.SetType("concat");
concat1_desc.SetInput("X", concat1_input_names);
......@@ -213,31 +210,24 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
gpd(graph, handler);
}
template class SimplifyAnakinDetectionPatternPass<1>;
template class SimplifyAnakinDetectionPatternPass<2>;
template class SimplifyAnakinDetectionPatternPass<3>;
template class SimplifyAnakinDetectionPatternPass<4>;
template class SimplifyAnakinDetectionPatternPass<5>;
template class SimplifyAnakinDetectionPatternPass<6>;
void SimplifyAnakinDetectionPatternPass::ApplyImpl(ir::Graph *graph) const {
const int pattern_nums = 6;
const std::string pattern_name = "simplify_anakin_detection_pattern_pass";
FusePassBase::Init(pattern_name, graph);
std::vector<bool> options = {true, false};
for (const auto &is_density : options) {
for (const auto &is_reshape : options) {
for (int i = 1; i <= pattern_nums; i++) {
RunSimplifyAnakinDetection(graph, i, is_density, is_reshape);
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(simplify_anakin_detection_pattern_pass,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
typedef paddle::framework::ir::SimplifyAnakinDetectionPatternPass
priorbox_pattern;
REGISTER_PASS(simplify_anakin_priorbox_detection_out_pass, priorbox_pattern);
......@@ -26,7 +26,6 @@ namespace ir {
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this
// structure.
template <int times>
class SimplifyAnakinDetectionPatternPass : public FusePassBase {
public:
virtual ~SimplifyAnakinDetectionPatternPass() {}
......
......@@ -25,11 +25,9 @@ namespace paddle {
namespace framework {
namespace ir {
template <int times>
void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) {
const std::string pattern_name =
"transpose_flatten" + std::to_string(times) + "_concat_fuse";
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
......@@ -122,31 +120,18 @@ void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
gpd(graph, handler);
}
template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<2>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;
void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {
const int pattern_nums = 6;
const std::string pattern_name = "transpose_flatten_concat_fuse";
FusePassBase::Init(pattern_name, graph);
for (int i = 1; i <= pattern_nums; i++) {
RunTransposeFlattenConcatFuse(graph, i);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
paddle::framework::ir::TransposeFlattenConcatFusePass);
......@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
......@@ -24,7 +26,6 @@ namespace ir {
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this
// structure.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
public:
virtual ~TransposeFlattenConcatFusePass() {}
......
......@@ -56,8 +56,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
}
}
static DDim GetDims(const Scope& scope, const std::string& name,
bool get_actual_dim = false) {
static DDim GetDimsDebug(const Scope& scope, const std::string& name,
bool get_actual_dim = false) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
return DDim({-1});
......@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name,
if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>();
// if (UNLIKELY(!tensor.IsInitialized())) {
// return DDim({-1});
// }
if (UNLIKELY(!tensor.IsInitialized())) {
return DDim({-1});
}
return tensor.dims();
} else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) {
......@@ -123,7 +123,7 @@ static int GetRowSize(const Scope& scope, const std::string& name) {
return -1;
}
static LoD GetLoD(const Scope& scope, const std::string& name) {
static LoD GetLoDDebug(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
auto default_lod = LoD({{}});
......@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>();
// if (UNLIKELY(!tensor.IsInitialized())) {
// return default_lod;
// }
if (UNLIKELY(!tensor.IsInitialized())) {
return default_lod;
}
return tensor.lod();
} else {
return default_lod;
......@@ -274,8 +274,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
}
std::string dtype = GetDtype(*scope, var_name);
ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")";
ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
ss << "(" << GetLoDDebug(*scope, var_name) << ")";
}
}
if (i != input.second.size() - 1) {
......@@ -305,8 +305,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
}
std::string dtype = GetDtype(*scope, output.second[i]);
ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")";
ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
ss << "(" << GetLoDDebug(*scope, var_name) << ")";
}
}
if (i != output.second.size() - 1) {
......
......@@ -365,6 +365,9 @@ class ExecutionContext {
auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_GE(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
......
......@@ -70,7 +70,7 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) {
return *this;
}
Tensor Tensor::Slice(int begin_idx, int end_idx) const {
Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const {
check_memory_size();
PADDLE_ENFORCE_GE(begin_idx, 0,
"The start row index must be greater than 0.");
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <cstring>
#include <memory>
#include <typeindex>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/ddim.h"
......@@ -27,10 +28,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_utils.h"
#endif
namespace paddle {
namespace framework {
......@@ -41,34 +38,10 @@ class Tensor {
#ifdef PADDLE_WITH_MKLDNN
public:
// TODO(jczaja): This is depracted and will be removed
inline mkldnn::memory::format format() const {
if (layout_ == DataLayout::kMKLDNN) {
return static_cast<mkldnn::memory::format>(mem_pd_.desc().data.format);
} else {
return mkldnn::memory::format::format_undef;
}
}
inline mkldnn::memory::format format() const { return format_; }
// TODO(jczaja): This is depracted and will be removed
inline void set_format(
const mkldnn::memory::format fmt,
mkldnn::memory::data_type data_type = mkldnn::memory::f32) {
mem_pd_ = paddle::platform::create_prim_desc_from_format(
paddle::framework::vectorize2int(dims()), fmt, data_type);
layout_ = DataLayout::kMKLDNN;
}
inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const {
return mem_pd_;
}
inline void set_mkldnn_prim_desc(
const mkldnn::memory::primitive_desc& mem_pd) {
// Internally MKL-DNN is just copying (increasing reference counter)
// to shared_ptr. So asignment should be quite cheap
mem_pd_ = mem_pd;
layout_ = DataLayout::kMKLDNN;
inline void set_format(const mkldnn::memory::format format) {
format_ = format;
}
protected:
......@@ -76,9 +49,12 @@ class Tensor {
* @brief the detail format of memory block which have layout as kMKLDNN
*
* @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
* nChw16c, etc. For a MKLDNN memory block, we store memory descriptor
* nChw16c, etc. For a MKLDNN memory block, layout will be set as
* DataLayout::kMKLDNN meanwhile detail memory format will be kept in
* this field.
*/
mutable mkldnn::memory::primitive_desc mem_pd_;
mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
#endif
public:
......@@ -157,7 +133,7 @@ class Tensor {
* @param[in] end_idx The index of the end row(exclusive) to slice.
* The index number begins from 0.
*/
Tensor Slice(int begin_idx, int end_idx) const;
Tensor Slice(int64_t begin_idx, int64_t end_idx) const;
platform::Place place() const {
PADDLE_ENFORCE_NOT_NULL(
......
......@@ -44,11 +44,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
<< dst_place;
return;
}
#ifdef PADDLE_WITH_MKLDNN
if (src.layout() == DataLayout::kMKLDNN) {
dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc());
}
#endif
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
......
......@@ -34,25 +34,41 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
auto input_name = op_desc.Input("Input").front();
auto image_name = op_desc.Input("Image").front();
auto output_name = op_desc.Output("Boxes").front();
auto op_type = op_desc.Type();
auto op_name = op_type + ":" + op_desc.Output("Boxes").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front();
// only for density_prior_box
std::vector<float> fixed_sizes = {};
std::vector<float> fixed_ratios = {};
std::vector<int> densities = {};
auto fixed_sizes =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
auto fixed_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
std::vector<float> min_sizes = {};
std::vector<float> max_sizes = {};
std::vector<float> aspect_ratios = {};
bool is_clip = false;
bool is_flip = false;
if (op_type == "density_prior_box") {
fixed_sizes =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
fixed_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
} else if (op_type == "prior_box") {
min_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("min_sizes"));
max_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("max_sizes"));
aspect_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("aspect_ratios"));
is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
is_flip = boost::get<bool>(op_desc.GetAttr("flip"));
}
std::vector<float> dens;
for (auto& ele : densities) {
dens.push_back(static_cast<float>(ele));
}
// lack flip
// auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
for (auto& ele : variances) {
LOG(INFO) << ele;
}
// lack img_h, img_w
auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
......@@ -66,14 +82,14 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
std::vector<float> temp_v = {};
engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", min_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", max_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", aspect_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
engine_->AddOpAttr(op_name, "is_flip", is_flip);
engine_->AddOpAttr(op_name, "is_clip", is_clip);
engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
......@@ -88,3 +104,4 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter);
......@@ -48,7 +48,7 @@ class AnakinOpConverter {
framework::OpDesc op_desc(op, nullptr);
std::string op_type = op_desc.Type();
AnakinOpConverter *it = nullptr;
if (op_type == "depthwise_conv2d") op_type = "conv2d";
if (op_type == "reshape2") op_type = "reshape";
if (op_type == "transpose2") op_type = "transpose";
if (op_type == "flatten2") op_type = "flatten";
......
......@@ -42,6 +42,8 @@ struct SimpleOpTypeSetTeller : public Teller {
teller_set.insert("dropout");
teller_set.insert("sigmoid");
teller_set.insert("sum");
teller_set.insert("depthwise_conv2d");
teller_set.insert("prior_box");
}
bool operator()(const std::string& op_type,
......
......@@ -37,14 +37,14 @@ using framework::ir::Node;
void analysis::AnakinSubgraphPass::ApplyImpl(
framework::ir::Graph *graph) const {
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph);
auto teller = [](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false;
return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller, 6 /* min_subgraph_size */);
SubGraphFuser fuser(graph, teller, 6 /* min_subgraph_size */);
fuser();
std::vector<std::string> graph_param_names =
......@@ -56,10 +56,10 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params);
CreateAnakinOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
}
}
......@@ -69,7 +69,7 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
nodes2remove.insert(node);
}
}
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));
}
......
......@@ -192,6 +192,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
block_desc.Proto()->SerializeAsString());
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "gpu_id", Get<int>("gpu_device_id"));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
SetAttr(op_desc->Proto(), "parameters", params);
......
......@@ -52,6 +52,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
scope->EraseVars({var_name});
continue;
}
auto *var = scope->FindLocalVar(var_name);
......
......@@ -886,4 +886,5 @@ USE_ANAKIN_CONVERTER(detection_out);
USE_ANAKIN_CONVERTER(density_prior_box);
USE_ANAKIN_CONVERTER(dropout);
USE_ANAKIN_CONVERTER(sum);
USE_ANAKIN_CONVERTER(prior_box);
#endif
......@@ -70,17 +70,15 @@ void GpuPassStrategy::EnableMKLDNN() {
// The following passes works for Anakin sub-graph engine.
const std::vector<std::string> kAnakinSubgraphPasses({
"infer_clean_graph_pass", //
"simplify_anakin_detection_pattern_pass5", //
"simplify_anakin_detection_pattern_pass4", //
"simplify_anakin_detection_pattern_pass3", //
"simplify_anakin_detection_pattern_pass2", //
"anakin_fillconstant_elementwisemul_fuse", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"infer_clean_graph_pass", //
"simplify_anakin_priorbox_detection_out_pass", //
"fillconstant_elementwisemul_fuse", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"quant_conv2d_dequant_fuse_pass", //
"anakin_subgraph_pass",
});
......@@ -97,13 +95,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", //
#endif
#endif //
"transpose_flatten_concat_fuse_pass",
});
for (int i = 6; i >= 2; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
......
......@@ -23,6 +23,12 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction()
function(inference_analysis_api_int8_test target model_dir data_dir filename)
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100)
endfunction()
function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
download_model(${install_dir} ${model_name})
inference_analysis_test(${target} SRCS ${filename}
......@@ -138,6 +144,28 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
# int8 image classification tests
if(WITH_MKLDNN)
set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8")
if (NOT EXISTS ${INT8_DATA_DIR})
inference_download_and_uncompress(${INT8_DATA_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "imagenet_val_100.tar.gz")
endif()
#resnet50 int8
set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR})
inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "resnet50_int8_model.tar.gz" )
endif()
inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
#mobilenet int8
set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet")
if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR})
inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "mobilenetv1_int8_model.tar.gz" )
endif()
inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
endif()
# bert, max_len=20, embedding_dim=128
set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
......
......@@ -53,19 +53,6 @@ void Split(const std::string &line, char sep, std::vector<T> *v) {
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_int32(iterations, 0, "Number of iterations");
namespace paddle {
namespace inference {
namespace analysis {
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model);
cfg->SetProgFile("__model__");
cfg->DisableGpu();
cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames(false);
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN();
}
template <typename T>
class TensorReader {
public:
TensorReader(std::ifstream &file, size_t beginning_offset,
std::vector<int> shape, std::string name)
: file_(file), position(beginning_offset), shape_(shape), name_(name) {
numel =
std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<T>());
}
PaddleTensor NextBatch() {
PaddleTensor tensor;
tensor.name = name_;
tensor.shape = shape_;
tensor.dtype = GetPaddleDType<T>();
tensor.data.Resize(numel * sizeof(T));
file_.seekg(position);
file_.read(static_cast<char *>(tensor.data.data()), numel * sizeof(T));
position = file_.tellg();
if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
if (file_.fail())
throw std::runtime_error(name_ + ": failed reading file.");
return tensor;
}
protected:
std::ifstream &file_;
size_t position;
std::vector<int> shape_;
std::string name_;
size_t numel;
};
std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
const std::vector<std::vector<PaddleTensor>> &test_data, int num_images) {
int test_data_batch_size = test_data[0][0].shape[0];
CHECK_LE(static_cast<size_t>(num_images),
test_data.size() * test_data_batch_size);
PaddleTensor images;
images.name = "input";
images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels;
labels.name = "labels";
labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images);
for (int i = 0; i < num_images; i++) {
auto batch = i / test_data_batch_size;
auto element_in_batch = i % test_data_batch_size;
std::copy_n(static_cast<float *>(test_data[batch][0].data.data()) +
element_in_batch * 3 * 224 * 224,
3 * 224 * 224,
static_cast<float *>(images.data.data()) + i * 3 * 224 * 224);
std::copy_n(static_cast<int64_t *>(test_data[batch][1].data.data()) +
element_in_batch,
1, static_cast<int64_t *>(labels.data.data()) + i);
}
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(2);
(*warmup_data)[0] = std::move(images);
(*warmup_data)[1] = std::move(labels);
return warmup_data;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
int32_t batch_size = FLAGS_batch_size) {
std::ifstream file(FLAGS_infer_data, std::ios::binary);
if (!file) {
FAIL() << "Couldn't open file: " << FLAGS_infer_data;
}
int64_t total_images{0};
file.read(reinterpret_cast<char *>(&total_images), sizeof(total_images));
LOG(INFO) << "Total images in file: " << total_images;
std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
std::vector<int> label_batch_shape{batch_size, 1};
auto labels_offset_in_file =
static_cast<size_t>(file.tellg()) +
sizeof(float) * total_images *
std::accumulate(image_batch_shape.begin() + 1,
image_batch_shape.end(), 1, std::multiplies<int>());
TensorReader<float> image_reader(file, 0, image_batch_shape, "input");
TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label");
auto iterations = total_images / batch_size;
if (FLAGS_iterations > 0 && FLAGS_iterations < iterations)
iterations = FLAGS_iterations;
for (auto i = 0; i < iterations; i++) {
auto images = image_reader.NextBatch();
auto labels = label_reader.NextBatch();
inputs->emplace_back(
std::vector<PaddleTensor>{std::move(images), std::move(labels)});
}
}
TEST(Analyzer_int8_resnet50, quantization) {
AnalysisConfig cfg;
SetConfig(&cfg);
AnalysisConfig q_cfg;
SetConfig(&q_cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all, 100);
std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
GetWarmupData(input_slots_all, 100);
q_cfg.EnableMkldnnQuantizer();
q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
CompareQuantizedAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
reinterpret_cast<const PaddlePredictor::Config *>(&q_cfg),
input_slots_all);
}
TEST(Analyzer_int8_resnet50, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
GetWarmupData(input_slots_all, 100);
cfg.EnableMkldnnQuantizer();
cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
std::vector<PaddleTensor> outputs;
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
# copyright (c) 2019 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import os
import numpy as np
import time
import sys
import random
import functools
import contextlib
from PIL import Image, ImageEnhance
import math
from paddle.dataset.common import download
random.seed(0)
np.random.seed(0)
DATA_DIM = 224
SIZE_FLOAT32 = 4
SIZE_INT64 = 8
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center == True:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = np.random.randint(0, width - size + 1)
h_start = np.random.randint(0, height - size + 1)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def process_image(img_path, mode, color_jitter, rotate):
img = Image.open(img_path)
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
return img
def download_unzip():
int8_download = 'int8/download'
target_name = 'data'
cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
int8_download)
target_folder = os.path.join(cache_folder, target_name)
data_urls = []
data_md5s = []
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa'
)
data_md5s.append('60f6525b0e1d127f345641d75d41f0a8')
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
)
data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5')
file_names = []
for i in range(0, len(data_urls)):
download(data_urls[i], cache_folder, data_md5s[i])
file_names.append(data_urls[i].split('/')[-1])
zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz')
if not os.path.exists(zip_path):
cat_command = 'cat'
for file_name in file_names:
cat_command += ' ' + os.path.join(cache_folder, file_name)
cat_command += ' > ' + zip_path
os.system(cat_command)
print('Data is downloaded at {0}\n').format(zip_path)
if not os.path.exists(target_folder):
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, zip_path)
os.system(cmd)
print('Data is unzipped at {0}\n'.format(target_folder))
data_dir = os.path.join(target_folder, 'ILSVRC2012')
print('ILSVRC2012 full val set at {0}\n'.format(data_dir))
return data_dir
def reader():
data_dir = download_unzip()
file_list = os.path.join(data_dir, 'val_list.txt')
output_file = os.path.join(data_dir, 'int8_full_val.bin')
with open(file_list) as flist:
lines = [line.strip() for line in flist]
num_images = len(lines)
if not os.path.exists(output_file):
print(
'Preprocessing to binary file...<num_images><all images><all labels>...\n'
)
with open(output_file, "w+b") as of:
#save num_images(int64_t) to file
of.seek(0)
num = np.array(int(num_images)).astype('int64')
of.write(num.tobytes())
for idx, line in enumerate(lines):
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
if not os.path.exists(img_path):
continue
#save image(float32) to file
img = process_image(
img_path, 'val', color_jitter=False, rotate=False)
np_img = np.array(img)
of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
* idx)
of.write(np_img.astype('float32').tobytes())
#save label(int64_t) to file
label_int = (int)(label)
np_label = np.array(label_int)
of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
* num_images + idx * SIZE_INT64)
of.write(np_label.astype('int64').tobytes())
print('The preprocessed binary file path {}\n'.format(output_file))
if __name__ == '__main__':
reader()
......@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true,
DEFINE_bool(record_benchmark, false,
"Record benchmark after profiling the model");
DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
DECLARE_bool(profile);
......@@ -58,6 +59,19 @@ DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace inference {
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
const auto *analysis_config =
reinterpret_cast<const AnalysisConfig *>(config);
......@@ -392,6 +406,32 @@ void TestPrediction(const PaddlePredictor::Config *config,
}
}
void CompareTopAccuracy(const std::vector<PaddleTensor> &output_slots1,
const std::vector<PaddleTensor> &output_slots2) {
// first output: avg_cost
if (output_slots1.size() == 0 || output_slots2.size() == 0)
throw std::invalid_argument(
"CompareTopAccuracy: output_slots vector is empty.");
PADDLE_ENFORCE(output_slots1.size() >= 2UL);
PADDLE_ENFORCE(output_slots2.size() >= 2UL);
// second output: acc_top1
if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0)
throw std::invalid_argument(
"CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 ||
output_slots2[1].dtype != paddle::PaddleDType::FLOAT32)
throw std::invalid_argument(
"CompareTopAccuracy: top1 accuracy output is of a wrong type.");
float *top1_quantized = static_cast<float *>(output_slots1[1].data.data());
float *top1_reference = static_cast<float *>(output_slots2[1].data.data());
LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized;
LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference;
LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy;
CHECK_LE(std::abs(*top1_quantized - *top1_reference),
FLAGS_quantized_accuracy);
}
void CompareDeterministic(
const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) {
......@@ -421,6 +461,17 @@ void CompareNativeAndAnalysis(
CompareResult(analysis_outputs, native_outputs);
}
void CompareQuantizedAndAnalysis(
const PaddlePredictor::Config *config,
const PaddlePredictor::Config *qconfig,
const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(config, true);
std::vector<PaddleTensor> analysis_outputs, quantized_outputs;
TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
TestOneThreadPrediction(qconfig, inputs, &quantized_outputs, true);
CompareTopAccuracy(quantized_outputs, analysis_outputs);
}
void CompareNativeAndAnalysis(
PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
const std::vector<std::vector<PaddleTensor>> &inputs) {
......
......@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU)
......@@ -38,20 +37,30 @@ else ()
set(AllocatorFacadeDeps)
endif()
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS
${AllocatorFacadeDeps}
cpu_allocator
locked_allocator
best_fit_allocator
aligned_allocator
auto_increment_allocator
zero_size_allocator
conditional_allocator
retry_allocator
buffered_allocator
allocator_strategy
legacy_allocator
)
nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
......@@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator {
underlying_allocator_->Allocate(size + kAlignment, attr);
return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
}
void FreeImpl(Allocation* allocation) override { delete allocation; }
};
} // namespace allocation
......
......@@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
auto ptr = AllocateImpl(size, attr);
ptr->RegisterDecoratedAllocator(this);
ptr->set_allocator(this);
return AllocationPtr(ptr);
}
void Allocator::FreeImpl(Allocation* allocation) {
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
}
void Allocator::Free(Allocation* allocation) {
allocation->PopDecoratedAllocator();
FreeImpl(allocation);
}
void Allocator::Free(Allocation* allocation) { delete allocation; }
const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
void AllocationDeleter::operator()(Allocation* allocation) const {
Allocator* allocator = allocation->TopDecoratedAllocator();
auto* allocator = allocation->allocator();
allocator->Free(allocation);
}
......
......@@ -46,56 +46,13 @@ class Allocator;
// NOTE: this is the base class of Allocation. Each allocator can use its own
// allocation object.
// NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
/**
* Allocation is returned by Allocator::Allocate() method.
*
* An allocator may be decorated by another allocator. For example, we can
* decorate
* a RetryAllocator to any allocator to perform allocation retry when first
* allocation request fails.
*
* Explanations of Allocator design is as follows:
*
* Suppose we have an allocator which is decorated by several allocators:
*
* A(1) <- A(2) <- A(3) <- ... <- A(n)
*
* , and the public allocator is A(1).
*
* The allocation process would be:
*
* A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate()
*
* , and the free process would be:
*
* A(1).Free() -> A(2).Free() -> ... -> A(n).Free()
*
* Therefore, we should record the allocator chain when allocating, so
* that we can free the allocation in the reverse order of allocator chain.
* The field `decorated_allocators_` is used to record this chain.
*
* Another example is that we want to add additional fields in Allocation,
* e.g., something what is done in AlignedAllocator, etc.
* In this case, we should declare a derived class of Allocation, which
* contains an underlying Allocation allocated by the underlying allocator.
* Therefore, `decorated_allocators_` of the new Allocation object would
* be a new chain, differing from the underlying Allocation object.
*/
class Allocation {
public:
Allocation(void* ptr, size_t size, platform::Place place)
: ptr_(ptr), size_(size), place_(place) {
// NOTE(zjl): Since decorated_allocators_ is usually a small vector
// We reserve a small buffer to it to prevent frequent heap allocation
// Not quite sure whether we need something like gtl vector.
decorated_allocators_.reserve(8);
}
: allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete;
Allocation(Allocation&& o) = delete;
Allocation& operator=(Allocation&& o) = delete;
// Returns the holding pointer.
// NOTE: For performance consideration, it is better not to make this method
......@@ -117,31 +74,17 @@ class Allocation {
const platform::Place& place() const { return place_; }
virtual ~Allocation();
private:
const std::vector<Allocator*>& DecoratedAllocators() const {
return decorated_allocators_;
}
inline void RegisterDecoratedAllocator(Allocator* allocator) {
decorated_allocators_.push_back(allocator);
}
Allocator* allocator() { return allocator_; }
inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
void set_allocator(Allocator* allocator) { allocator_ = allocator; }
inline Allocator* TopDecoratedAllocator() {
return decorated_allocators_.back();
}
virtual ~Allocation();
private:
Allocator* allocator_;
void* ptr_;
size_t size_;
platform::Place place_;
std::vector<Allocator*> decorated_allocators_;
friend class Allocator;
friend class AllocationDeleter;
};
using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
......@@ -191,12 +134,9 @@ class Allocator {
// True if the `Allocate` is thread safe.
virtual bool IsAllocThreadSafe() const;
// This function should not be called outside
void Free(Allocation* allocation);
protected:
virtual void Free(Allocation* allocation);
virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
virtual void FreeImpl(Allocation* allocation);
private:
friend class AllocationDeleter;
......
......@@ -49,17 +49,6 @@ namespace paddle {
namespace memory {
namespace allocation {
static inline std::shared_ptr<Allocator> WrapRetryAllocator(
std::shared_ptr<Allocator> allocator, int64_t retry_time) {
if (retry_time > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time);
allocator.reset(retry_allocator);
}
return allocator;
}
// TODO(yy): Dirty code here. This class should be configurable in runtime.
class CPUManagedAllocator : public Allocator {
public:
......@@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator {
std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
auto* allocation = chunks_.back().get();
std::shared_ptr<Allocator> allocator(new LockedAllocator(
std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
std::unique_ptr<Allocator> allocator(new LockedAllocator(
std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
allocator = WrapRetryAllocator(allocator, retry_time_);
if (retry_time_ > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time_);
allocator.reset(retry_allocator);
}
return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
}
......@@ -197,23 +190,13 @@ class AllocatorFacadePrivate {
~AllocatorFacadePrivate() = default;
AllocatorFacadePrivate() {
auto strategy = GetAllocatorStrategy();
switch (strategy) {
case AllocatorStrategy::kLegacy: {
InitLegacyAllocator();
break;
}
case AllocatorStrategy::kNaiveBestFit: {
InitCPUAllocator();
InitCUDAAllocator();
InitCUDAPinnedAllocator();
WrapZeroSizeAllocator();
break;
}
default: {
PADDLE_THROW("Unsupported allocator strategy: %d",
static_cast<int>(strategy));
}
if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
InitLegacyAllocator();
} else {
InitCPUAllocator();
InitCUDAAllocator();
InitCUDAPinnedAllocator();
WrapZeroSizeAllocator();
}
}
......@@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() {
std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, Allocator::Attr attr) {
return std::shared_ptr<Allocation>(Alloc(place, size, attr));
return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
AllocationDeleter());
}
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
......
......@@ -19,22 +19,16 @@
DEFINE_string(
allocator_strategy, "legacy",
"The allocation strategy. Legacy means the original allocator of Fluid."
"naive_best_fit means the experimental best fit allocator. "
"allocator. Enum in [legacy, naive_best_fit].");
"New means the experimental allocators of Fluid. in [legacy, new]");
namespace paddle {
namespace memory {
namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() {
if (FLAGS_allocator_strategy == "legacy") {
return AllocatorStrategy::kLegacy;
} else if (FLAGS_allocator_strategy == "naive_best_fit") {
return AllocatorStrategy::kNaiveBestFit;
} else {
PADDLE_THROW("Unsupported allocator strategy: %s",
FLAGS_allocator_strategy);
}
return FLAGS_allocator_strategy == "legacy"
? AllocatorStrategy::kLegacy
: AllocatorStrategy::kNaiveBestFit;
}
AllocatorStrategy GetAllocatorStrategy() {
......
......@@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const {
}
return num;
}
void BestFitAllocator::FreeImpl(Allocation* allocation) {
void BestFitAllocator::Free(Allocation* allocation) {
auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(bf_allocation,
"The input allocation is not BestFitAllocation.");
......
......@@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator {
void InsertFreeNode(const ListIt& it);
protected:
void FreeImpl(Allocation* allocation) override;
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
......
......@@ -22,11 +22,11 @@ namespace paddle {
namespace memory {
namespace allocation {
BufferedAllocator::BufferedAllocator(std::shared_ptr<Allocator> allocator)
BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator)
: underlying_allocator_(std::move(allocator)) {
PADDLE_ENFORCE_NOT_NULL(
underlying_allocator_,
"Underlying allocator of BufferedAllocator must not be null");
"Underlying allocator of BufferedAllocator must be unmanaged");
if (underlying_allocator_->IsAllocThreadSafe()) {
mtx_.reset(new std::mutex());
}
......@@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) {
while (!allocations_.empty()) { // free the largest
auto it = --allocations_.end();
cur += it->second->size();
underlying_allocator_->Free(it->second.release());
delete it->second.release();
allocations_.erase(it);
if (cur >= size) return;
}
}
bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; }
void BufferedAllocator::FreeImpl(Allocation *allocation) {
bool BufferedAllocator::IsAllocThreadSafe() const {
return this->underlying_allocator_->IsAllocThreadSafe();
}
void BufferedAllocator::Free(Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
allocations_.emplace(allocation->size(), AllocationPtr(allocation));
}
Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
{
platform::LockGuardPtr<std::mutex> guard(mtx_);
......@@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (it != allocations_.end() && it->first < size * 2) {
AllocationPtr result(std::move(it->second));
allocations_.erase(it);
return result.release();
return new AllocationWithUnderlying(std::move(result));
}
}
try {
return underlying_allocator_->Allocate(size, attr).release();
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
} catch (BadAlloc &) {
FreeCache(size);
return underlying_allocator_->Allocate(size, attr).release();
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
}
}
......
......@@ -31,7 +31,7 @@ namespace allocation {
// underlying_allocator_
class BufferedAllocator : public Allocator {
public:
explicit BufferedAllocator(std::shared_ptr<Allocator> allocator);
explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator);
~BufferedAllocator();
......@@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator {
void FreeCache(size_t size);
protected:
void FreeImpl(Allocation *allocation) override;
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
std::unique_ptr<Allocator> underlying_allocator_;
std::multimap<size_t, AllocationPtr> allocations_;
std::unique_ptr<std::mutex> mtx_;
};
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/memory/allocation/buffered_allocator.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
......@@ -65,7 +66,7 @@ class StubAllocator : public Allocator {
size_t GetFreeCount() const { return destruct_count_; }
protected:
void FreeImpl(Allocation *allocation) override {
void Free(Allocation *allocation) override {
auto *alloc = dynamic_cast<StubAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(alloc);
if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
......
......@@ -20,27 +20,25 @@ namespace paddle {
namespace memory {
namespace allocation {
CPUAllocation::CPUAllocation(void *ptr, size_t size)
: Allocation(ptr, size, platform::CPUPlace()) {}
bool CPUAllocator::IsAllocThreadSafe() const { return true; }
void CPUAllocator::FreeImpl(Allocation *allocation) {
void *p = allocation->ptr();
#ifdef _WIN32
_aligned_free(p);
#else
free(p);
#endif
void CPUAllocator::Free(Allocation *allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation));
free(allocation->ptr());
delete allocation;
}
Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *p;
#ifdef _WIN32
p = _aligned_malloc(size, kAlignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!",
size);
#endif
return new Allocation(p, size, platform::CPUPlace());
void *ptr;
auto status = posix_memalign(&ptr, kAlignment, size);
if (UNLIKELY(status) != 0) {
throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
size, status));
}
return new CPUAllocation(ptr, size);
}
} // namespace allocation
} // namespace memory
......
......@@ -31,13 +31,19 @@ namespace allocation {
//
// NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import
// an open-sourced allocator into Paddle.
class CPUAllocator;
class CPUAllocation : public Allocation {
public:
CPUAllocation(void* ptr, size_t size);
};
class CPUAllocator : public Allocator {
public:
constexpr static size_t kAlignment = 4096UL;
constexpr static size_t kAlignment = 64u;
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
};
} // namespace allocation
......
......@@ -23,14 +23,15 @@ namespace paddle {
namespace memory {
namespace allocation {
bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
void CUDAAllocator::FreeImpl(Allocation* allocation) {
void CUDAAllocator::Free(Allocation* allocation) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(allocation->place()),
auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
place_);
PADDLE_ENFORCE(cudaFree(allocation->ptr()));
delete allocation;
}
Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
platform::CUDADeviceGuard guard(place_.device);
void* ptr;
......@@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
"Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
status, cudaGetErrorString(status)));
}
return new Allocation(ptr, size, platform::Place(place_));
return new CUDAAllocation(ptr, size, platform::Place(place_));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -20,6 +20,13 @@ namespace paddle {
namespace memory {
namespace allocation {
// CUDA System allocator and allocation.
// Just a flag type.
class CUDAAllocation : public Allocation {
public:
using Allocation::Allocation;
};
class CUDAAllocator : public Allocator {
public:
explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
......@@ -28,7 +35,7 @@ class CUDAAllocator : public Allocator {
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
......
......@@ -134,22 +134,26 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
}
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
public:
GPUBuddyAllocatorList()
: allocators_(platform::GetCUDADeviceCount()),
flags_(platform::GetCUDADeviceCount()) {
allocation::GPUMemMonitor.Initialize(allocators_.size());
}
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator **a_arr = nullptr;
static std::vector<int> devices;
std::call_once(init_flag, [gpu_id]() {
devices = platform::GetSelectedDevices();
int gpu_num = devices.size();
BuddyAllocator *Get(size_t dev_id) {
PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
std::call_once(flags_[dev_id], [this, dev_id] {
allocation::GPUMemMonitor.Initialize(devices.size());
a_arr = new BuddyAllocator *[gpu_num];
for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
a_arr[i] = nullptr;
platform::SetDeviceId(dev_id);
allocators_[dev_id] = new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
......@@ -163,19 +167,13 @@ class GPUBuddyAllocatorList {
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[dev_id];
}
private:
std::vector<BuddyAllocator *> allocators_;
std::vector<std::once_flag> flags_;
};
}
});
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static GPUBuddyAllocatorList allocators;
platform::SetDeviceId(gpu_id);
return allocators.Get(gpu_id);
auto pos = std::distance(devices.begin(),
std::find(devices.begin(), devices.end(), gpu_id));
return a_arr[pos];
}
#endif
......@@ -194,7 +192,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
#ifdef PADDLE_WITH_CUDA
auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr && size > 0) {
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
......@@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
return tmp_alloc;
}
void LegacyAllocator::FreeImpl(Allocation *allocation) {
void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place());
......
......@@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator {
protected:
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
void FreeImpl(Allocation *allocation) override;
void Free(Allocation *allocation) override;
private:
platform::Place place_;
......
......@@ -17,7 +17,6 @@
#include <utility>
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
namespace paddle {
namespace memory {
namespace allocation {
......@@ -25,24 +24,26 @@ namespace allocation {
bool LockedAllocator::IsAllocThreadSafe() const { return true; }
LockedAllocator::LockedAllocator(
std::shared_ptr<Allocator> underlying_allocator)
std::unique_ptr<Allocator> &&underlying_allocator)
: underlying_allocator_(std::move(underlying_allocator)) {
PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
if (!underlying_allocator_->IsAllocThreadSafe()) {
mtx_.reset(new std::mutex());
}
}
void LockedAllocator::FreeImpl(Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
underlying_allocator_->Free(allocation);
void LockedAllocator::Free(Allocation *allocation) {
{
platform::LockGuardPtr<std::mutex> guard(mtx_);
reinterpret_cast<AllocationWithUnderlying *>(allocation)
->allocation_.reset(); // Destroy inner allocation
}
delete allocation;
}
Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
return underlying_allocator_->Allocate(size, attr).release();
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -24,15 +24,15 @@ namespace allocation {
// A allocator to make underlying allocator thread safe.
class LockedAllocator : public Allocator {
public:
explicit LockedAllocator(std::shared_ptr<Allocator> underlying_allocator);
explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator);
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation *allocation) override;
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
std::unique_ptr<Allocator> underlying_allocator_;
std::unique_ptr<std::mutex> mtx_;
};
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
FLAGS_allocator_strategy = "naive_best_fit";
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -20,15 +20,20 @@ namespace paddle {
namespace memory {
namespace allocation {
bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
void CPUPinnedAllocator::Free(Allocation *allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation));
PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
delete allocation;
}
Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
Allocator::Attr attr) {
// PADDLE_ENFORCE_EQ(
// attr, kCrossDevice,
// "CPUPinnedAllocator should be used for Cross-Device Communication");
void *ptr;
PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
return new Allocation(ptr, size, platform::CUDAPinnedPlace());
return new CPUPinnedAllocation(ptr, size);
}
} // namespace allocation
} // namespace memory
......
......@@ -20,12 +20,18 @@ namespace memory {
namespace allocation {
// Allocator uses `cudaHostAlloc`
class CPUPinnedAllocation : public Allocation {
public:
CPUPinnedAllocation(void *ptr, size_t size)
: Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
};
class CPUPinnedAllocator : public Allocator {
public:
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation *allocation) override;
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
};
......
......@@ -18,15 +18,25 @@ namespace paddle {
namespace memory {
namespace allocation {
void RetryAllocator::FreeImpl(Allocation* allocation) {
bool RetryAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
void RetryAllocator::Free(Allocation* allocation) {
// Delete underlying allocation first.
underlying_allocator_->Free(allocation);
cv_.notify_all();
reinterpret_cast<AllocationWithUnderlying*>(allocation)->allocation_.reset();
{
// notify all waited allocators, they can try to allocate memory after free.
std::lock_guard<std::mutex> lock(mutex_);
cv_.notify_all();
}
delete allocation;
}
Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
auto alloc_func = [&, this]() {
return underlying_allocator_->Allocate(size, attr).release();
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
};
// In fact, we can unify the code of allocation success and failure
// But it would add lock even when allocation success at the first time
......
......@@ -25,25 +25,32 @@ namespace paddle {
namespace memory {
namespace allocation {
class RetryAllocator;
class RetryAllocator : public Allocator {
public:
RetryAllocator(std::shared_ptr<Allocator> allocator, size_t retry_ms)
RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms)
: underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
EnforceCheck();
}
bool IsAllocThreadSafe() const override;
private:
void EnforceCheck() {
PADDLE_ENFORCE_NOT_NULL(
underlying_allocator_,
"UnderlyingAllocator of RetryAllocator must not be null");
underlying_allocator_.get(),
"UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator");
PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
"UnderlyingAllocator of RetryAllocator must be thread-safe");
}
bool IsAllocThreadSafe() const override { return true; }
protected:
void FreeImpl(Allocation* allocation) override;
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
std::unique_ptr<Allocator> underlying_allocator_;
std::chrono::milliseconds retry_time_;
std::mutex mutex_;
std::condition_variable cv_;
......@@ -51,6 +58,8 @@ class RetryAllocator : public Allocator {
// For debug, We can add an atomic integer to record how many memory sizes are
// waited to allocate
// std::atomic<size_t> waited_allocate_size_{0};
friend class RetryAllocation;
};
} // namespace allocation
......
......@@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new Allocation(nullptr, 0, place_);
return new ZeroSizeAllocation(place_);
} else {
return underlying_allocator_->Allocate(size, attr).release();
}
}
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
if (allocation->size() == 0) {
delete allocation;
} else {
underlying_allocator_->Free(allocation);
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -24,6 +24,12 @@ namespace allocation {
// The allocator handles the request's size is zero. Allocator will always
// return an allocation even the request size is zero. However, the
// allocation.ptr() is nullptr
class ZeroSizeAllocation : public Allocation {
public:
explicit ZeroSizeAllocation(const platform::Place& p)
: Allocation(nullptr, 0, p) {}
};
class ZeroSizeAllocator : public Allocator {
public:
ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
......@@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator {
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
void FreeImpl(Allocation* allocation) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
......
......@@ -65,7 +65,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
// Get numel and dtype
size_t numel = 0;
auto dtype = kDefaultDtype;
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype);
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype,
context.GetPlace());
// Alloc the continuous space
auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
......@@ -74,14 +75,18 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
// Init the continuous space
auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output");
int64_t offset = 0;
size_t offset = 0;
size_t size_of_dtype = framework::SizeOfType(dtype);
if (context.Attr<bool>("copy_data")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
int64_t len = out_tensors[i]->numel();
auto sub_tensor = fused_tensor->Slice(offset, offset + len);
offset += len;
framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
size_t len = static_cast<size_t>(in_tensors[i]->numel());
auto sub_tensor = fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len));
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);
offset +=
Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
}
} else if (context.Attr<bool>("set_constant")) {
math::SetConstant<DeviceContext, T> set_constant;
......@@ -92,11 +97,13 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
// Make the outputs point to the continuous space.
offset = 0;
for (size_t i = 0; i < out_tensors.size(); ++i) {
int64_t len = out_tensors[i]->numel();
size_t len = static_cast<size_t>(out_tensors[i]->numel());
auto dim = out_tensors[i]->dims();
out_tensors[i]
->ShareDataWith(fused_tensor->Slice(offset, offset + len))
->ShareDataWith(fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
.Resize(dim);
len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
offset += len;
VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i]
<< ") ,dim:(" << dim << ")"
......@@ -104,12 +111,28 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
}
}
private:
// Note(zcd): Addresses should be aligned, otherwise, the results may have
// diff.
size_t Alignment(size_t size, const platform::Place &place) const {
// Allow to allocate the minimum chunk size is 4 KB.
size_t alignment = 1 << 12;
if (platform::is_gpu_place(place)) {
// Allow to allocate the minimum chunk size is 256 B.
alignment = 1 << 8;
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
void GetMemSizeAndDtype(
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
framework::proto::VarType::Type *dtype) const {
framework::proto::VarType::Type *dtype,
const platform::Place &place) const {
PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
*numel = 0;
size_t size_of_dtype = 0;
for (size_t i = 0; i < var_names.size(); ++i) {
PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
var_names[i]);
......@@ -119,6 +142,7 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.",
var_names[i], kDefaultDtype);
*dtype = p_dtype;
size_of_dtype = framework::SizeOfType(p_dtype);
}
PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal.");
......@@ -126,7 +150,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_GT(size, 0);
VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:("
<< lod_tensors[i]->dims() << ")";
*numel += size;
*numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype;
}
}
};
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bpr_loss_op.h"
#include <memory>
namespace paddle {
namespace operators {
......@@ -127,6 +128,23 @@ neural networks>(https://arxiv.org/abs/1511.06939)
)DOC");
}
};
class BprLossGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("bpr_loss_grad");
op->SetInput("X", Input("X"));
op->SetInput("Label", Input("Label"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
......@@ -134,7 +152,7 @@ namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::BprLossGradDescMaker);
REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>,
ops::BprLossOpKernel<CPUCtx, double>);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -568,13 +569,31 @@ class ROIPerspectiveTransformOpMaker
}
};
class ROIPerspectiveTransformGradDescMaker
: public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("roi_perspective_transform_grad");
op->SetInput("X", Input("X"));
op->SetInput("ROIs", Input("ROIs"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp,
ops::ROIPerspectiveTransformOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::ROIPerspectiveTransformGradDescMaker);
REGISTER_OPERATOR(roi_perspective_transform_grad,
ops::ROIPerspectiveTransformGradOp);
REGISTER_OP_CPU_KERNEL(roi_perspective_transform,
......
......@@ -77,7 +77,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
} else {
functor.RunMidWise(n, pre, post);
}
z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc());
z->set_layout(DataLayout::kMKLDNN);
z->set_format(x->format());
} else {
PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
x->format() != memory::format::format_undef,
......@@ -115,8 +116,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
// create mkldnn memory for dst
auto dst_mem_pd = sum_pd.dst_primitive_desc();
memory dst_memory = memory(dst_mem_pd, z_data);
memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
std::vector<primitive::at> inputs;
inputs.push_back(srcs[0]);
......@@ -129,7 +129,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
pipeline.push_back(sum_prim);
stream(stream::kind::eager).submit(pipeline).wait();
z->set_mkldnn_prim_desc(dst_mem_pd);
z->set_layout(DataLayout::kMKLDNN);
z->set_format(
(memory::format)dst_memory.get_primitive_desc().desc().data.format);
}
}
};
......@@ -150,19 +152,24 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
auto* out = dout;
auto *x = dout, *y = dout;
auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
in->set_layout(DataLayout::kMKLDNN);
in->set_format(out->format());
};
if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
if (dx->dims() == dy->dims()) {
auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
if (dx) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dx->mutable_data<T>(ctx.GetPlace()));
dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc());
set_mkldnn_format(dx, dout);
}
if (dy) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dy->mutable_data<T>(ctx.GetPlace()));
dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc());
set_mkldnn_format(dy, dout);
}
}
} else {
......
......@@ -65,11 +65,17 @@ by input arguments.
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input");
} // namespace operators
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(
REGISTER_OPERATOR(
gaussian_random_batch_size_like,
paddle::operators::GaussianRandomBatchSizeLikeOp,
paddle::operators::GaussianRandomBatchSizeLikeOpMaker);
paddle::operators::GaussianRandomBatchSizeLikeOpMaker,
paddle::framework::EmptyGradOpMaker,
paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference);
// Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/im2sequence_op.h"
#include <memory>
#include <string>
#include <vector>
......@@ -146,12 +147,28 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel {
}
};
class Im2SequenceGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("im2sequence_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::Im2SequenceGradDescMaker);
REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp);
REGISTER_OP_CPU_KERNEL(
im2sequence,
......
......@@ -10,6 +10,7 @@
limitations under the License. */
#include "paddle/fluid/operators/interpolate_op.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
......@@ -194,21 +195,46 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
return framework::OpKernelType(
ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
ctx.GetPlace());
}
};
class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType(ForwardOp().Type() + "_grad");
op->SetInput("X", Input("X"));
if (ForwardOp().Inputs().count("OutSize") > 0) {
op->SetInput("OutSize", Input("OutSize"));
}
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(InterpolateGradNoNeedBufferVarsInference,
"X");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad);
ops::InterpolateGradDescMaker);
REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad,
ops::InterpolateGradNoNeedBufferVarsInference);
REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad);
ops::InterpolateGradDescMaker);
REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad,
ops::InterpolateGradNoNeedBufferVarsInference);
REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel<float>,
ops::InterpolateKernel<double>,
ops::InterpolateKernel<uint8_t>);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/l1_norm_op.h"
#include <memory>
namespace paddle {
namespace operators {
......@@ -62,12 +63,28 @@ $$Out = \sum{|X|}$$
}
};
class L1NormGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("l1_norm_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::L1NormGradDescMaker);
REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp);
REGISTER_OP_CPU_KERNEL(
l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/label_smooth_op.h"
#include <memory>
#include <string>
namespace paddle {
......@@ -105,10 +106,23 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->SetOutputDim(framework::GradVarName("X"),
ctx->GetInputDim(framework::GradVarName("Out")));
}
};
class LabelSmoothGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("label_smooth_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
......@@ -117,7 +131,7 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::LabelSmoothGradDescMaker);
REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp);
REGISTER_OP_CPU_KERNEL(
label_smooth,
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/linear_chain_crf_op.h"
#include <memory>
namespace paddle {
namespace operators {
......@@ -250,14 +251,46 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
}
};
class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("linear_chain_crf_grad");
op->SetAttrMap(Attrs());
op->SetInput("Emission", Input("Emission"));
op->SetInput("Transition", Input("Transition"));
op->SetInput("Label", Input("Label"));
op->SetInput("Alpha", Output("Alpha"));
op->SetInput("EmissionExps", Output("EmissionExps"));
op->SetInput("TransitionExps", Output("TransitionExps"));
op->SetInput(framework::GradVarName("LogLikelihood"),
OutputGrad("LogLikelihood"));
op->SetOutput(framework::GradVarName("Emission"), InputGrad("Emission"));
op->SetOutput(framework::GradVarName("Transition"),
InputGrad("Transition"));
return op;
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
LinearChainCRFGradNoNeedBufferVarsInference, "Transition", "Emission");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
ops::LinearChainCRFOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp);
ops::LinearChainCRFOpMaker, ops::LinearChainCRFGradDescMaker);
REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp,
ops::LinearChainCRFGradNoNeedBufferVarsInference);
REGISTER_OP_CPU_KERNEL(
linear_chain_crf,
ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/log_loss_op.h"
#include <memory>
namespace paddle {
namespace operators {
......@@ -100,12 +101,29 @@ class LogLossGradOp : public framework::OperatorWithKernel {
}
};
class LogLossGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("log_loss_grad");
op->SetInput("Predicted", Input("Predicted"));
op->SetInput("Labels", Input("Labels"));
op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
op->SetOutput(framework::GradVarName("Predicted"), InputGrad("Predicted"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::LogLossGradDescMaker);
REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp);
REGISTER_OP_CPU_KERNEL(
log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lstm_op.h"
#include <memory>
#include <string>
namespace paddle {
......@@ -264,12 +265,51 @@ class LSTMGradOp : public framework::OperatorWithKernel {
}
};
class LSTMGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("lstm_grad");
op->SetAttrMap(Attrs());
op->SetInput("Input", Input("Input"));
op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
if (ForwardOp().Inputs().count("H0") > 0) {
op->SetInput("H0", Input("H0"));
op->SetOutput(framework::GradVarName("H0"), InputGrad("H0"));
}
if (ForwardOp().Inputs().count("C0") > 0) {
op->SetInput("C0", Input("C0"));
op->SetOutput(framework::GradVarName("C0"), InputGrad("C0"));
}
op->SetInput("Weight", Input("Weight"));
op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
op->SetInput("Bias", Input("Bias"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
op->SetInput("Cell", Output("Cell"));
op->SetInput("Hidden", Output("Hidden"));
op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden"));
op->SetInput("BatchGate", Output("BatchGate"));
op->SetInput("BatchCellPreAct", Output("BatchCellPreAct"));
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::LSTMGradOpDescMaker);
REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp);
REGISTER_OP_CPU_KERNEL(
lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/margin_rank_loss_op.h"
#include <memory>
namespace paddle {
namespace operators {
......@@ -94,8 +95,6 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("Activated"),
......@@ -106,13 +105,31 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
}
};
class MarginRankLossGradDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("margin_rank_loss_grad");
op->SetInput("Activated", Output("Activated"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetInput("Label", Input("Label"));
op->SetOutput(framework::GradVarName("X1"), InputGrad("X1"));
op->SetOutput(framework::GradVarName("X2"), InputGrad("X2"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
ops::MarginRankLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>);
ops::MarginRankLossGradDescMaker);
REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp);
REGISTER_OP_CPU_KERNEL(
margin_rank_loss,
......
......@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mean_op.h"
#include <memory>
#include <string>
#include <unordered_map>
namespace paddle {
namespace operators {
......@@ -61,7 +64,8 @@ class MeanGradOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type = ctx.Input<Tensor>("X")->type();
auto input_data_type =
ctx.Input<Tensor>(framework::GradVarName("Out"))->type();
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
......@@ -81,13 +85,16 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MeanGradNoNeedBufferVarsInference, "X");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
ops::MeanGradMaker);
REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
REGISTER_OPERATOR(mean_grad, ops::MeanGradOp,
ops::MeanGradNoNeedBufferVarsInference);
REGISTER_OP_CPU_KERNEL(
mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -96,7 +96,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
std::vector<int> src_tz = framework::vectorize2int(x->dims());
auto src_format = x->format();
auto src_format =
src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
const std::string key = gethash(src_tz, algorithm);
const std::string key_src_data =
......@@ -126,8 +127,10 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
if (p_fwd == nullptr) {
// create mkldnn memory for input X
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), src_format);
auto src_memory = std::shared_ptr<memory>(
new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data)));
new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
// save src_memory to be referred in backward path
dev_ctx.SetBlob(key_src_mem, src_memory);
......@@ -174,7 +177,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
pipeline.push_back(*p_fwd);
stream(stream::kind::eager).submit(pipeline).wait();
y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc());
y->set_layout(DataLayout::kMKLDNN);
y->set_format(GetMKLDNNFormat(*dst_memory));
}
template <typename T>
......@@ -192,6 +196,9 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());
auto diff_y_format =
diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
const std::string key = gethash(diff_dst_tz, algorithm);
const std::string key_src_data =
key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
......@@ -203,8 +210,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
const std::string key_fwd_pd =
key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
const std::string key_with_layouts = key + std::to_string(*p_src_layout) +
"-" + std::to_string(diff_y->format());
const std::string key_with_layouts =
key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
const std::string key_diff_src_mem =
key_with_layouts + "@eltwise_diff_src_mem";
const std::string key_diff_dst_mem =
......@@ -227,8 +234,10 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
if (p_grad == nullptr) {
// create mkldnn memory for input diff_y
auto diff_dst_md = platform::MKLDNNMemDesc(
diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
auto diff_dst_memory = std::shared_ptr<memory>(
new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)));
new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
// retrieve eltwise primitive desc from device context
......@@ -272,7 +281,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
pipeline.push_back(*p_grad);
stream(stream::kind::eager).submit(pipeline).wait();
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
}
template <typename T, mkldnn::algorithm algorithm>
......
......@@ -206,14 +206,17 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
// keys for backward pass
const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, global_stats, x->format(),
src_tz, epsilon, flags, global_stats, input_format,
ctx.op().Output("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
auto user_src_md = x->get_mkldnn_prim_desc().desc();
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
// create primitive descriptor for batch norm forward
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
......@@ -227,8 +230,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine,
key);
auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(),
to_void_cast(x_data));
auto src_memory =
handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
// crate mkldnn memory for weights(scale/shift)
auto scaleshift_memory =
......@@ -262,7 +265,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
variance_memory, false);
}
y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc());
y->set_layout(DataLayout::kMKLDNN);
y->set_format(platform::GetMKLDNNFormat(*dst_memory));
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*batch_norm_p);
......@@ -332,6 +336,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
mkldnn::memory::format dst_format =
platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
......@@ -339,14 +346,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// keys from forward pass
const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, false, x->format(),
src_tz, epsilon, flags, false, input_format,
ctx.op().Input("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
// keys for primitives reuse
const std::string key_with_hash =
key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false,
x->format());
input_format);
const std::string key_batch_norm_bwd_p =
key_with_hash + "@batch_norm_bwd_p";
const std::string key_batch_norm_src_mem_p =
......@@ -366,8 +373,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
primitive reorder_diff_dst;
bool is_diff_dst_reordered = false;
auto user_diff_dst_memory =
memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data));
auto user_diff_dst_memory = memory(
{{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
to_void_cast(diff_y_data));
// MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic;
......@@ -451,7 +459,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory);
// set layout/format of output tensors
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format);
} else {
// primitives already exist
UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data));
......@@ -476,7 +487,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
}
// set layout/format of output tensors
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format);
}
// execute optional reorder and batch_norm backward primitive
......
......@@ -210,7 +210,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
stream(stream::kind::eager).submit({*concat_p}).wait();
output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc());
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetDstMemFormat(*concat_pd));
}
};
} // namespace operators
......
......@@ -96,8 +96,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
auto* output = ctx.Output<Tensor>("Output");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN);
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN);
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
input->format() != memory::format::format_undef,
"Wrong layout/format set for Input tensor");
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
......@@ -144,19 +148,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<primitive> pipeline;
// For convolution with groups we need to recreate primitive descriptor
// as Paddle tensor is not having group dims while mkldnn treats
// group as another dimensions
mkldnn::memory::primitive_desc user_weights_mpd =
filter->get_mkldnn_prim_desc();
if (g > 1) {
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
user_weights_mpd =
mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine);
}
auto src_format = input->format();
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
......@@ -166,7 +165,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
mkldnn::memory::format weights_format = mkldnn::memory::format::any;
weights_format = mkldnn::memory::format::any;
// Check the format for user's special output
if (chosen_memory_format != mkldnn::memory::format::any) {
if (is_conv3d) {
......@@ -206,10 +205,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p = handler.AcquireSrcMemory(
input->get_mkldnn_prim_desc(), to_void_cast<T>(input_data));
auto user_src_memory_p =
handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_mpd, to_void_cast<T>(filter_data));
user_weights_md, to_void_cast<T>(filter_data));
// create reorder primitive if the input format is not the preferred one
auto src_memory_p =
......@@ -282,7 +281,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_p);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc());
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
}
void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
const bool is_test = ctx.Attr<bool>("is_test");
......@@ -948,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_bwd_weights_p);
auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc();
filter_grad->set_mkldnn_prim_desc(filter_grad_mpd);
filter_grad->set_layout(DataLayout::kMKLDNN);
filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
}
if (input_grad) {
......@@ -972,7 +972,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_bwd_data_p);
input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc());
input_grad->set_layout(DataLayout::kMKLDNN);
input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
}
stream(stream::kind::eager).submit(pipeline).wait();
}
......
......@@ -221,7 +221,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc());
output->set_layout(DataLayout::kMKLDNN);
output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
}
private:
......
......@@ -42,12 +42,8 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
// The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The format of matrix sets inside the another layers.
// TODO(jczaja): Remove this hack after checking performance on block layout
auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(tensor->dims()),
mkldnn::memory::format::oihw);
tensor->set_mkldnn_prim_desc(tensor_mem_pd);
tensor->set_layout(DataLayout::kMKLDNN);
tensor->set_format(mkldnn::memory::format::oihw);
}
};
} // namespace operators
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册